def main() -> None:
    """
    Build and package a PyInstaller distribution.

    Steps:
      1. Delete old build artefacts (``BUILD_DIR``, ``DIST_SUBDIR``) and
         recreate the build/dist directories.
      2. Run PyInstaller on ``SPECFILE``.
      3. Zip the resulting ``DIST_SUBDIR`` directory to ``ZIPFILEBASE`` and
         report where everything lives.
    """
    log.info("Deleting old distribution...")
    shutil.rmtree(BUILD_DIR, ignore_errors=True)
    shutil.rmtree(DIST_SUBDIR, ignore_errors=True)  # NOT DIST_DIR
    os.makedirs(BUILD_DIR, exist_ok=True)
    os.makedirs(DIST_DIR, exist_ok=True)

    log.info("Building new distribution...")
    args = (
        ['pyinstaller', '--clean', '--log-level=INFO']
        + PYINSTALLER_EXTRA_OPTIONS
        + [SPECFILE]
    )
    with pushd(CWD_FOR_PYINSTALLER):
        log.debug("In directory: {}".format(os.getcwd()))
        log.debug("Running PyInstaller with args: {!r}".format(args))
        subprocess.check_call(args)

    log.info("Zipping to {!r}...".format(ZIPFILEBASE))
    # Renamed from "zipfile" so as not to shadow the stdlib "zipfile" module.
    zip_path = shutil.make_archive(ZIPFILEBASE, ZIPFORMAT, DIST_SUBDIR)
    log.info("""
The {DIST_SUBDIR} directory should contain everything you need to run.
Run with: {LAUNCHFILE}
Look for warnings in: {WARNFILE}
To distribute, use {zipfile}
""".format(
        DIST_SUBDIR=DIST_SUBDIR,
        LAUNCHFILE=LAUNCHFILE,
        WARNFILE=WARNFILE,
        zipfile=zip_path,
    ))
def untar_to_directory(tarfile: str,
                       directory: str,
                       verbose: bool = False,
                       gzipped: bool = False,
                       skip_if_dir_exists: bool = True,
                       run_func: RunFuncType = None,
                       chdir_via_python: bool = True,
                       tar_executable: str = None,
                       tar_supports_force_local: bool = None) -> None:
    """
    Unpacks a TAR file into a specified directory.

    Args:
        tarfile: filename of the ``.tar`` file
        directory: destination directory
        verbose: be verbose?
        gzipped: is the ``.tar`` also gzipped, e.g. a ``.tar.gz`` file?
        skip_if_dir_exists: don't do anything if the destination directory
            exists?
        run_func: function to use to call an external command; if ``None``,
            defaults to :func:`subprocess.check_call` (previously, passing
            ``None`` caused a ``TypeError`` at call time)
        chdir_via_python: change directory via Python, not via ``tar``.
            Consider using this via Windows, because Cygwin ``tar`` v1.29
            falls over when given a Windows path for its ``-C`` (or
            ``--directory``) option.
        tar_executable: name of the ``tar`` executable (default is ``tar``)
        tar_supports_force_local: does tar support the ``--force-local``
            switch? If you pass ``None`` (the default), this is checked
            directly via ``tar --help``. Linux/GNU tar does; MacOS tar
            doesn't; Cygwin tar does; Windows 10 (build 17063+) tar doesn't.
    """
    if run_func is None:
        # Bug fix: the default of None used to be called directly, raising
        # "TypeError: 'NoneType' object is not callable". Use a plain
        # synchronous call as the sane default.
        run_func = subprocess.check_call
    if skip_if_dir_exists and os.path.isdir(directory):
        log.info("Skipping extraction of {} as directory {} exists",
                 tarfile, directory)
        return
    tar = which_and_require(tar_executable or "tar")
    if tar_supports_force_local is None:
        tar_supports_force_local = tar_supports_force_local_switch(tar)
    log.info("Extracting {} -> {}", tarfile, directory)
    mkdir_p(directory)
    args = [tar, "-x"]  # -x: extract
    if verbose:
        args.append("-v")  # -v: verbose
    if gzipped:
        args.append("-z")  # -z: decompress using gzip
    if tar_supports_force_local:
        args.append("--force-local")  # allows filenames with colons in
    args.extend(["-f", tarfile])  # -f: filename follows
    if chdir_via_python:
        with pushd(directory):
            run_func(args)
    else:
        # chdir via tar itself
        args.extend(["-C", directory])  # -C: change to directory
        run_func(args)
def prepare_umls_for_bioyodie(cfg: UmlsBioyodieConversionConfig) -> None:
    """
    Prepare downloaded UMLS data for Bio-YODIE, according to the
    instructions at https://github.com/GateNLP/bio-yodie-resource-prep.

    Broad sequence (everything happens within ``cfg.tmp_dir``; the final
    result is copied to ``cfg.dest_dir``):

    1. Deduce the UMLS release (e.g. ``2017AB``) from the UMLS ZIP filename.
    2. Clone the Bio-YODIE resource-prep repository and make its working
       directories.
    3. Fetch and build Scala for the Bio-YODIE preprocessor.
    4. Unzip the UMLS data and its MetamorphoSys (MMSYS) tool.
    5. Run MetamorphoSys in batch (headless) mode to build a Metathesaurus
       subset.
    6. Run the Bio-YODIE preprocessor over that subset.
    7. Copy the preprocessor's output to ``cfg.dest_dir``.

    Args:
        cfg: configuration object; this function reads ``umls_zip``,
            ``dest_dir``, ``tmp_dir``, ``java_home``, ``gate_home``,
            ``groovy_executable``, ``bioyodie_prep_repo_url`` and
            ``scala_url`` from it.

    Exits (via :func:`die`) if the release cannot be parsed from the UMLS
    ZIP filename, or if ``cfg.dest_dir`` already exists.

    NOTE(review): the Java classpath below is joined with ``":"`` and the
    bundled JRE path is ``jre/linux``, so this presumably works on
    Linux/Unix only — confirm before reusing elsewhere.
    """
    # -------------------------------------------------------------------------
    # Parameter checks
    # -------------------------------------------------------------------------
    assert cfg.java_home
    assert cfg.gate_home
    # -------------------------------------------------------------------------
    # Establish the release (version)
    # -------------------------------------------------------------------------
    # There are two releases per year, e.g. 2017AA and 2017AB.
    release_regex = regex.compile(r"umls-(\d\d\d\dA[AB])-full.zip")
    umls_zip_basename = os.path.basename(cfg.umls_zip)
    try:
        release = release_regex.match(umls_zip_basename).group(1)
    except AttributeError:  # 'NoneType' object has no attribute 'group'
        release = None  # for type-checker only (below); die() does not return
        die(f"Unable to work out UMLS release from filename: "
            f"{umls_zip_basename!r}")
    # -------------------------------------------------------------------------
    # Directory names
    # -------------------------------------------------------------------------
    umls_root_dir = join(cfg.tmp_dir, "umls_data_with_mmsys")
    umls_metadir = umls_root_dir
    umls_mmsys_home = umls_metadir
    # ... because the GUI installer wants "release.dat" (which is in the root
    # and config/2017AA directories of "mmsys.zip") to be in the same directory
    # as the Metathesaurus files. Do NOT put it in a "MMSYS" subdirectory,
    # despite
    # https://www.nlm.nih.gov/research/umls/implementation_resources/community/mmsys/BatchMRCXTBuilder.html
    umls_lib_dir = join(umls_mmsys_home, "lib")
    umls_plugins_dir = join(umls_mmsys_home, "plugins")
    umls_output_dir = join(cfg.tmp_dir, "umls_output")
    # ... Where we tell it to store data.
    # Log files and other output go here.
    bioyodie_repo_dir = join(cfg.tmp_dir, "bio-yodie-resource-prep")
    bioyodie_db_dir = join(bioyodie_repo_dir, "databases")
    bioyodie_scala_dir = join(bioyodie_repo_dir, "scala")
    bioyodie_tmpdata_dir = join(bioyodie_repo_dir, "tmpdata")
    bioyodie_umls_dir_containing_symlink = join(
        bioyodie_repo_dir, "srcs", "umls",
        "2015AB")  # hard-coded "2015AB" (expected by the Bio-YODIE scripts)
    bioyodie_umls_input_dir = join(bioyodie_umls_dir_containing_symlink,
                                   "META")  # hard-coded "META"
    bioyodie_output_dir = join(bioyodie_repo_dir, "output")
    # -------------------------------------------------------------------------
    # Filenames
    # -------------------------------------------------------------------------
    scala_tgz = join(bioyodie_scala_dir, "scala.tgz")
    builder_script = join(bioyodie_repo_dir, "bin", "all.sh")
    mmsys_zip = join(umls_root_dir, "mmsys.zip")
    config_file = join(umls_metadir, "config.properties")
    boot_config = join(umls_mmsys_home, "etc", "subset.boot.properties")
    log4j_config = join(umls_mmsys_home, "etc", "rudolf.log4j.properties")  # new  # noqa
    system_java_home = cfg.java_home
    umls_java_home = join(umls_mmsys_home, "jre", "linux")  # it brings its own
    # -------------------------------------------------------------------------
    # Checks
    # -------------------------------------------------------------------------
    if os.path.exists(cfg.dest_dir):
        die(f"Directory already exists: {cfg.dest_dir}")
    system_unzip = require_external_tool("unzip")
    # These are required by the Bio-YODIE preprocessor:
    groovy_executable = cfg.groovy_executable or require_external_tool(
        "groovy")  # noqa
    require_external_tool("gzip")
    require_external_tool("zcat")
    # -------------------------------------------------------------------------
    # Environment variables
    # -------------------------------------------------------------------------
    # For UMLS: use the JRE bundled with MMSYS.
    umls_env = os.environ.copy()
    umls_env[EnvVar.JAVA_HOME] = umls_java_home
    # For Bio-YODIE preprocessor: use the system Java, GATE, and Groovy.
    bioyodie_env = os.environ.copy()
    bioyodie_env[EnvVar.JAVA_HOME] = system_java_home
    bioyodie_env[EnvVar.GATE_HOME] = cfg.gate_home
    groovy_dir = os.path.dirname(os.path.abspath(groovy_executable))
    old_path = bioyodie_env.get(EnvVar.PATH, "")
    # Prepend Groovy's directory to PATH (skipping empty components):
    new_path_with_groovy = os.pathsep.join(x for x in [groovy_dir, old_path]
                                           if x)
    bioyodie_env[EnvVar.PATH] = new_path_with_groovy
    # -------------------------------------------------------------------------
    log.info("Cloning Bio-YODIE resource prep repository...")
    # -------------------------------------------------------------------------
    check_call_verbose(
        ["git", "clone", cfg.bioyodie_prep_repo_url, bioyodie_repo_dir])
    # -------------------------------------------------------------------------
    log.info("Making directories...")
    # -------------------------------------------------------------------------
    mkdir_p(umls_output_dir)
    mkdir_p(bioyodie_db_dir)
    # mkdir_p(bioyodie_scala_dir)  # already exists
    mkdir_p(bioyodie_tmpdata_dir)
    mkdir_p(bioyodie_umls_dir_containing_symlink)
    mkdir_p(bioyodie_output_dir)
    # -------------------------------------------------------------------------
    log.info("Fetching/building Scala for the BioYODIE processor...")
    # -------------------------------------------------------------------------
    # ... either before we set JAVA_HOME (to use the system Java) or after
    # we've unpacked MMSYS (which brings its own JRE), but not in between!
    download(cfg.scala_url, scala_tgz)
    with pushd(bioyodie_scala_dir):
        check_call_verbose(["tar", "-xzvf", scala_tgz])
        check_call_verbose(["ant"], env=bioyodie_env)
    # -------------------------------------------------------------------------
    log.info("Unzipping UMLS data...")
    # -------------------------------------------------------------------------
    check_call_verbose(["unzip", "-j", cfg.umls_zip, "-d", umls_root_dir])
    # -j: junk paths (extract "flat" into the specified directory)
    # -------------------------------------------------------------------------
    log.info("Unzipping UMLS MetamorphoSys (MMSYS) program (and its JRE)...")
    # -------------------------------------------------------------------------
    check_call_verbose(["unzip", mmsys_zip, "-d", umls_mmsys_home])
    # "To ensure proper functionality users must unzip mmsys.zip to the same
    # directory as the other downloaded files."
    # -- https://www.ncbi.nlm.nih.gov/books/NBK9683/
    # ... but see also example above.
    # -------------------------------------------------------------------------
    log.info("Running MetamorphoSys in batch mode...")
    # -------------------------------------------------------------------------
    # https://www.nlm.nih.gov/research/umls/implementation_resources/community/mmsys/BatchMetaMorphoSys.html  # noqa
    # NB ":" separator: Unix classpath convention.
    classpath = ":".join([
        umls_mmsys_home,
        umls_plugins_dir,  # RNC extra
        join(umls_lib_dir, "jpf-boot.jar"),
        join(umls_lib_dir, "jpf.jar"),  # RNC extra
        # You can use "dir/*" to mean "all JAR files in a directory":
        # https://en.wikipedia.org/wiki/Classpath
        join(umls_plugins_dir, "gov.nih.nlm.umls.meta", "lib", "*"),  # RNC extra  # noqa
        join(umls_plugins_dir, "gov.nih.nlm.umls.mmsys", "lib", "*"),  # RNC extra  # noqa
        join(umls_plugins_dir, "gov.nih.nlm.umls.mmsys.gui", "lib", "*"),  # RNC extra  # noqa
        join(umls_plugins_dir, "gov.nih.nlm.umls.mmsys.io", "lib", "*"),  # RNC extra  # noqa
        join(umls_plugins_dir, "gov.nih.nlm.umls.util", "lib", "*"),  # RNC extra  # noqa
    ])
    write_text(
        config_file,
        get_mmsys_configfile_text(metadir=umls_metadir,
                                  mmsys_home=umls_mmsys_home,
                                  release=release))
    write_text(log4j_config, LOG4J_PROPERTIES_TEXT)
    with pushd(umls_mmsys_home):
        log.warning(f"The next step is slow, and doesn't say much. "
                    f"It produces roughly 29 Gb at peak. "
                    f"Watch progress with: "
                    f"watch 'du -bc {cfg.tmp_dir} | tail -1'")
        check_call_verbose(
            [
                join(cfg.java_home, "bin", "java"),
                "-classpath", classpath,
                "-Djava.awt.headless=true",
                f"-Djpf.boot.config={boot_config}",
                f"-Dlog4j.configurationFile={log4j_config}",
                # not "log4j.configuration" as in the original! Argh.
                # http://logging.apache.org/log4j/2.x/manual/configuration.html
                f"-Dinput.uri={umls_metadir}",
                f"-Doutput.uri={umls_output_dir}",
                f"-Dmmsys.config.uri={config_file}",
                # Additional from run_linux.sh:
                "-client",  # JVM option: client rather than server mode
                "-Dunzip.native=true",
                f"-Dunzip.path={system_unzip}",
                "-Dfile.encoding=UTF-8",
                "-Xms1000M",  # was 300M, but it's 1000M in run_linux.sh
                "-Xmx2000M",  # was 1000M, but it's 2000M in run_linux.sh
                "org.java.plugin.boot.Boot"
            ],
            env=umls_env)
    # -------------------------------------------------------------------------
    log.info("Converting UMLS data to Bio-YODIE format...")
    # -------------------------------------------------------------------------
    # Symlink the MMSYS output into the (hard-coded) place where the
    # Bio-YODIE scripts expect to find their UMLS input.
    os.symlink(src=umls_output_dir,
               dst=bioyodie_umls_input_dir,
               target_is_directory=True)
    with pushd(bioyodie_repo_dir):
        log.warning("The next step is also slow.")
        check_call_verbose([builder_script], env=bioyodie_env)
    # -------------------------------------------------------------------------
    log.info(f"Moving Bio-YODIE data to destination directory: {cfg.dest_dir}")
    # -------------------------------------------------------------------------
    output_files = os.listdir(bioyodie_output_dir)
    if output_files:
        shutil.copytree(bioyodie_output_dir, cfg.dest_dir)
        # ... destination should not already exist
        # ... it will make intermediate directories happily
    else:
        log.error(f"No output files in {bioyodie_output_dir}! "
                  f"Did the Bio-YODIE preprocessor partly crash?")
def launch_slurm(jobname: str,
                 cmd: str,
                 memory_mb: int,
                 project: str,
                 qos: str,
                 email: str,
                 duration: timedelta,
                 tasks_per_node: int,
                 cpus_per_task: int,
                 partition: str = "",
                 modules: List[str] = None,
                 directory: str = None,
                 encoding: str = "ascii") -> None:
    """
    Launch a job into the SLURM environment.

    Args:
        jobname: name of the job
        cmd: command to be executed
        memory_mb: maximum memory requirement per process (Mb)
        project: project name
        qos: quality-of-service name
        email: user's e-mail address
        duration: maximum duration per job
        tasks_per_node: tasks per (cluster) node
        cpus_per_task: CPUs per task
        partition: cluster partition name
        modules: SLURM modules to load (default: ``["default-wbic"]``)
        directory: directory to change to (default: the current working
            directory *at call time*; previously the default was captured
            once at import time via ``os.getcwd()`` in the signature, a
            classic evaluated-at-definition pitfall)
        encoding: encoding to apply to launch script as sent to ``sbatch``
    """
    if directory is None:
        directory = os.getcwd()  # resolved per call, not at import time
    if partition:
        partition_cmd = f"#SBATCH -p {partition}"
    else:
        partition_cmd = ""
    if modules is None:
        modules = ["default-wbic"]
    log.info("Launching SLURM job: {}", jobname)
    # The submission script itself; NB f-string interpolation below.
    script = f"""#!/bin/bash
#! Name of the job:
#SBATCH -J {jobname}
#! Which project should jobs run under:
#SBATCH -A {project}
#! What QoS [Quality of Service] should the job run in?
#SBATCH --qos={qos}
#! How much resource should be allocated?
#SBATCH --tasks-per-node={tasks_per_node}
#SBATCH --cpus-per-task={cpus_per_task}
#! Memory requirements
#SBATCH --mem={memory_mb}
#! How much wall-clock time will be required?
#SBATCH --time={strfdelta(duration, SLURM_TIMEDELTA_FMT)}
#! What e-mail address to use for notifications?
#SBATCH --mail-user={email}
#! What types of email messages do you wish to receive?
#SBATCH --mail-type=ALL
#! Uncomment this to prevent the job from being requeued (e.g. if
#! interrupted by node failure or system downtime):
#! SBATCH --no-requeue
#! Partition
{partition_cmd}
#! sbatch directives end here (put any additional directives above this line)
#! ############################################################
#! Modify the settings below to specify the application's environment, location
#! and launch method:
#! Optionally modify the environment seen by the application
#! (note that SLURM reproduces the environment at submission irrespective of ~/.bashrc):
. /etc/profile.d/modules.sh # Leave this line (enables the module command)
module purge # Removes all modules still loaded
module load {" ".join(modules)} # Basic one, e.g. default-wbic, is REQUIRED - loads the basic environment
#! Insert additional module load commands after this line if needed:
#! Full path to your application executable:
application="hostname"
#! Run options for the application:
options=""
#! Work directory (i.e. where the job will run):
workdir="$SLURM_SUBMIT_DIR" # The value of SLURM_SUBMIT_DIR sets workdir to the directory
# in which sbatch is run.
#! Are you using OpenMP (NB this is **unrelated to OpenMPI**)? If so increase this
#! safe value to no more than 24:
export OMP_NUM_THREADS=24
# Command line to be submited by SLURM:
CMD="{cmd}"
###############################################################
### You should not have to change anything below this line ####
###############################################################
cd $workdir
echo -e "Changed directory to `pwd`.\n"
JOBID=$SLURM_JOB_ID
echo -e "JobID: $JOBID\n======"
echo "Time: `date`"
echo "Running on master node: `hostname`"
echo "Current directory: `pwd`"
if [ "$SLURM_JOB_NODELIST" ]; then
#! Create a machine file:
export NODEFILE=`/usr/bin/generate_pbs_nodefile`
cat $NODEFILE | uniq > machine.file.$JOBID
echo -e "\nNodes allocated:\n================"
echo `cat machine.file.$JOBID | sed -e 's/\..*$//g'`
fi
echo -e "\nExecuting command:\n==================\n$CMD\n"
eval $CMD
"""  # noqa
    cmdargs = ["sbatch"]
    # Feed the script to sbatch on its stdin, from within `directory`.
    with pushd(directory):
        p = Popen(cmdargs, stdin=PIPE)
        p.communicate(input=script.encode(encoding))