def test_13(self):
    scripts1 = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
    sc = ScriptCollector(scripts1)
    assert sc.scripts == scripts1
    scripts2 = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
    sc.scripts = scripts2
    assert sc.scripts == scripts2
def test_15(self):
    scripts = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
    sc = ScriptCollector(scripts)
    assert sc.scripts == scripts
    sc.dump()
    paths = [s.path for s in scripts]
    assert all(os.path.isfile(p) for p in paths)
    pytest.helpers.unlink(paths)
def test_6(self):
    scripts = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
    for s in scripts:
        s.write()
    sc = ScriptCollector([s.path for s in scripts])
    assert len(sc.scripts) == 2
    assert all(isinstance(s, Script) for s in sc)
    pytest.helpers.unlink([s.path for s in scripts])
def test_12(self):
    container = ScriptCollector(None)
    task = MockTask(container)
    assert task.script_collector is container
    assert len(task.script_collector) == 0
    task.add_script(pytest.helpers.get_py_script(0, 1))
    assert len(task.script_collector) == 1
def test_4(self):
    script = pytest.helpers.get_py_script(0, 1)
    script.write()
    sc = ScriptCollector(script.path)
    assert len(sc.scripts) == 1
    assert isinstance(sc.scripts[0], Script)
    pytest.helpers.unlink([script.path])
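# A minimal usage sketch of the ScriptCollector API exercised by the tests
# above, assuming pyjob's usual import layout; the suffix keyword and the
# echo payload are illustrative assumptions, not taken from this corpus.
from pyjob.script import Script, ScriptCollector

collector = ScriptCollector(None)  # start with an empty collector
script = Script(directory=".", prefix="demo_", stem="hello", suffix=".sh")
script.append("echo 'hello world'")  # Script behaves like a list of lines
collector.add(script)  # add a single Script; lists of Scripts also work
collector.dump()  # write every collected script to disk
assert len(collector) == 1  # collectors support len() and iteration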
def write_mrbump_files(ensemble_pdbs, amoptd, job_time=MRBUMP_RUNTIME, ensemble_options=None, directory=None):
    """Write the MRBUMP job files for all the ensembles.

    Arguments:
    ensemble_pdbs -- list of the ensembles, each a single pdb file.
    amoptd -- dictionary with job options.
    job_time -- maximum permissible runtime (mainly used for batch queueing systems).
    ensemble_options -- dictionary with ensemble-specific keywords e.g. ensemble_options[ensemble_name] = {'ncopies': ncopies}
    directory -- working directory to write files to.
    """
    if not directory:
        directory = os.getcwd()

    collector = ScriptCollector(None)
    for ensemble_pdb in ensemble_pdbs:
        name = os.path.splitext(os.path.basename(ensemble_pdb))[0]  # Get name from pdb path

        # Get any options specific to this ensemble; reset each iteration so
        # one ensemble's options cannot leak into the next
        keyword_options = {}
        if ensemble_options and name in ensemble_options:
            keyword_options = ensemble_options[name]

        # Generate dictionary with all the options for this job and write to keyword file
        keyword_dict = mrbump_cmd.keyword_dict(ensemble_pdb, name, amoptd, keyword_options)
        keyword_file = os.path.join(directory, name + '.mrbump')
        keyword_str = mrbump_cmd.mrbump_keyword_file(keyword_dict)
        with open(keyword_file, 'w') as f:
            f.write(keyword_str)

        script = write_jobscript(name, keyword_file, amoptd, directory=directory, job_time=job_time)
        collector.add(script)

    if not len(collector.scripts):
        raise RuntimeError("No job scripts created!")
    return collector
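# A hypothetical call to write_mrbump_files; amoptd is assumed to be the
# standard AMPLE option dictionary, and the ensemble paths and ncopies
# value below are illustrative only.
ensemble_pdbs = ["ensembles/c1_t100_r1_polyAla.pdb", "ensembles/c2_t100_r1_polyAla.pdb"]
ensemble_options = {"c1_t100_r1_polyAla": {"ncopies": 2}}
collector = write_mrbump_files(ensemble_pdbs, amoptd, ensemble_options=ensemble_options)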
def test_14(self):
    container = ScriptCollector(pytest.helpers.get_py_script(10, 1))
    task = MockTask(container)
    assert len(task.script_collector) == 1
    task.lock()
    assert task.locked
    with pytest.raises(PyJobError):
        task.add_script(pytest.helpers.get_py_script(1, 1))
    assert len(task.script_collector) == 1
def _create_scripts(self, rosetta_dir, **kwargs):
    """Create scripts and set the path to the working directory"""
    collector = ScriptCollector(None)
    owd = os.getcwd()
    for name in self.test_dict.keys():
        os.chdir(self.run_dir)
        work_dir = os.path.join(self.run_dir, name)
        args = self.test_dict[name]['args']
        # Rosetta is the only thing likely to change between platforms, so we update the entry
        if rosetta_dir and self._is_in_args('-rosetta_dir', args):
            args = self._update_args(args, [['-rosetta_dir', rosetta_dir]])
        # Additional arguments for submitting to a cluster
        args = self._update_cluster_args(args, **kwargs)
        if EXTRA_ARGS:
            args = self._update_args(args, EXTRA_ARGS)
        # We track different modules using the name of the test case
        if name.startswith(ENSEMBLER):
            testcase_type = ENSEMBLER
        elif name.startswith(MODELLING):
            testcase_type = MODELLING
        else:
            testcase_type = 'ample'
        if testcase_type != 'ample' and sys.platform.startswith('win'):
            logger.critical("Cannot run module testcases on Windows due to multiprocessing bug")
            continue
        script = self.write_script(self.run_dir, name, args + [['-work_dir', work_dir]], testcase_type)
        collector.add(script)
        # Record the directory the case is run in so we can pass it to the unittest
        self.test_dict[name]['work_dir'] = work_dir
        # Run the setup function if one is provided
        if 'setup' in self.test_dict[name] and callable(self.test_dict[name]['setup']):
            self.test_dict[name]['setup'](self.run_dir)
    os.chdir(owd)  # Back to where we started
    return collector
def create_ensemble_db(database, pdb_db, nproc=2, submit_qtype=None, submit_queue=False, chunk_size=5000):
    """Create the MoRDa search database

    Parameters
    ----------
    database : str
        The path to the database folder
    pdb_db : str
        The path to a local copy of the Protein Data Bank
    nproc : int, optional
        The number of processors [default: 2]
    submit_qtype : str
        The cluster submission queue type - currently supports SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster
    chunk_size : int, optional
        The number of jobs to submit at the same time [default: 5000]

    Raises
    ------
    RuntimeError
        Windows is currently not supported
    """
    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    if "MRD_DB" in os.environ:
        morda_installed_through_ccp4 = True
    else:
        download_morda()
        morda_installed_through_ccp4 = False

    morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
    simbad_dat_path = os.path.join(database, '**', '*.dat')
    morda_dat_files = {os.path.basename(f) for f in glob.glob(morda_dat_path)}
    simbad_dat_files = {os.path.basename(f) for f in glob.glob(simbad_dat_path)}
    erroneous_files = {"1bbzA_0.dat", "1gt0D_0.dat", "1h3oA_0.dat", "1kskA_1.dat", "1l0sA_0.dat"}

    def delete_erroneous_files(erroneous_paths):
        for f in erroneous_paths:
            if os.path.isfile(f):
                logger.warning("File flagged to be erroneous ... removing from database: %s", f)
                os.remove(f)

    erroneous_paths = [os.path.join(database, name[1:3], name) for name in erroneous_files]
    delete_erroneous_files(erroneous_paths)

    dat_files = list(morda_dat_files - simbad_dat_files - erroneous_files)
    if len(dat_files) < 1:
        logger.info('SIMBAD ensemble database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(os.environ["MRD_DB"])
        leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
        return
    else:
        logger.info("%d new entries were found in the MoRDa database, updating SIMBAD ensemble database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")

    mrbump_stdin = """
MDLS True
MDLC False
MDLD False
MDLP False
MDLM False
MDLU False
CHECK False
UPDATE False
PICKLE False
MRNUM 5
SCOP False
DEBUG False
RLEVEL 100
GESAMT_MERGE False
USEE True
GESE True
GEST True
AMPT False
DOPHMMER True
DOHHPRED False
PDBLOCAL {}
END
""".format(pdb_db)

    run_dir = tmp_dir(directory=os.getcwd())

    # Generate the sub directories in advance
    sub_dir_names = {os.path.basename(f).rsplit('.', 1)[0][1:3] for f in dat_files}
    for sub_dir_name in sub_dir_names:
        sub_dir = os.path.join(database, sub_dir_name)
        if os.path.isdir(sub_dir):
            continue
        os.makedirs(sub_dir)

    # Submit in chunks, so we don't take up too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = len(dat_files) // chunk_size + (len(dat_files) % chunk_size > 0)
    for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)
        chunk_dat_files = dat_files[i:i + chunk_size]

        # Create the database files
        files = []
        collector = ScriptCollector(None)
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            final_file = os.path.join(database, code[1:3], code + ".dat")
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            get_seq_output = os.path.join(tmp_d, code + ".seq")
            mrbump_directory = os.path.join(tmp_d, 'search_mrbump_1')
            cmd = [
                ["export CCP4_SCR={}".format(tmp_d)],
                ["export MRD_DB={}".format(os.environ['MRD_DB'])],
                ["cd", tmp_d],
                [exe, "-c", code, "-m", "d"],
                ['ccp4-python', '-c',
                 "'import simbad.util; simbad.util.get_sequence(\"{0}\", \"{1}\")'".format(get_model_output, get_seq_output)],
                ['mrbump', 'seqin', get_seq_output, '<< eof'],
                [mrbump_stdin],
                ['eof'],
                ['ccp4-python', '-c',
                 "'import simbad.util; simbad.util.get_mrbump_ensemble(\"{0}\", \"{1}\")'".format(mrbump_directory, final_file)],
            ]
            script = Script(directory=tmp_d)
            for c in cmd:
                script.append(' '.join(map(str, c)))
            collector.add(script)
            log = script.path.rsplit('.', 1)[0] + '.log'
            files += [(script.path, log, tmp_d)]

        scripts, _, tmps = zip(*files)

        submit_chunk(collector=collector, run_dir=os.getcwd(), nproc=nproc, job_name='ensemble_db',
                     submit_qtype=submit_qtype, submit_queue=submit_queue, permit_nonzero=True,
                     monitor=None, success_func=None)

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        shutil.rmtree(os.environ["MRD_DB"])

    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_ensemble.txt'))
def test_11(self):
    container = ScriptCollector(None)
    task = MockTask(container)
    assert task.script_collector is container
def run(self, models_dir, nproc=2, shres=3.0, pklim=0.5, npic=50, rotastep=1.0,
        min_solvent_content=20, submit_nproc=None, submit_qtype=None, submit_queue=None,
        monitor=None, chunk_size=0, **kwargs):
    """Run amore rotation function on a directory of models

    Parameters
    ----------
    models_dir : str
        The directory containing the models to run the rotation search on
    nproc : int, optional
        The number of processors to run the job on
    shres : int, float, optional
        Spherical harmonic resolution [default: 3.0]
    pklim : int, float, optional
        Peak limit, output all peaks above <float> [default: 0.5]
    npic : int, optional
        Number of peaks to output from the translation function map for each orientation [default: 50]
    rotastep : int, float, optional
        Size of rotation step [default: 1.0]
    min_solvent_content : int, float, optional
        The minimum solvent content present in the unit cell with the input model [default: 20]
    submit_nproc : int
        The number of processors to use on the head node when creating submission scripts on a cluster [default: 1]
    submit_qtype : str
        The cluster submission queue type - currently supports SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster
    monitor
    chunk_size : int, optional
        The number of jobs to submit at the same time

    Returns
    -------
    file
        log file for each model in the models_dir
    """
    self.shres = shres
    self.pklim = pklim
    self.npic = npic
    self.rotastep = rotastep
    self.submit_qtype = submit_qtype
    self.submit_queue = submit_queue
    self.simbad_dat_files = simbad.db.find_simbad_dat_files(models_dir)

    mtz_labels = simbad.util.mtz_util.GetLabels(self.mtz)

    i = InputMR_DAT()
    i.setHKLI(self.mtz)
    i.setLABI_F_SIGF(mtz_labels.f, mtz_labels.sigf)
    i.setMUTE(True)
    run_mr_data = runMR_DAT(i)

    sg = run_mr_data.getSpaceGroupName().replace(" ", "")
    cell = " ".join(map(str, run_mr_data.getUnitCell()))

    sol_calc = simbad.util.matthews_prob.SolventContent(cell, sg)

    dir_name = "simbad-tmp-" + str(uuid.uuid1())
    self.script_log_dir = os.path.join(self.work_dir, dir_name)
    os.mkdir(self.script_log_dir)

    self.hklpck0 = self._generate_hklpck0()

    self.ccp4_scr = os.environ["CCP4_SCR"]
    default_tmp_dir = os.path.join(self.work_dir, 'tmp')
    if self.tmp_dir:
        self.template_tmp_dir = os.path.join(self.tmp_dir, dir_name + "-{0}")
    else:
        self.template_tmp_dir = os.path.join(default_tmp_dir, dir_name + "-{0}")

    predicted_molecular_weight = 0
    if run_mr_data.Success():
        i = InputCCA()
        i.setSPAC_HALL(run_mr_data.getSpaceGroupHall())
        i.setCELL6(run_mr_data.getUnitCell())
        i.setMUTE(True)
        run_cca = runCCA(i)
        if run_cca.Success():
            predicted_molecular_weight = run_cca.getAssemblyMW()

    dat_models = []
    for dat_model in self.simbad_dat_files:
        name = os.path.basename(dat_model.replace(".dat", ""))
        pdb_struct = simbad.util.pdb_util.PdbStructure()
        pdb_struct.from_file(dat_model)
        try:
            solvent_content = sol_calc.calculate_from_struct(pdb_struct)
            if solvent_content < min_solvent_content:
                msg = "Skipping %s: solvent content is predicted to be less than %.2f"
                logger.debug(msg, name, min_solvent_content)
                continue
        except ValueError:
            msg = "Skipping %s: Error calculating solvent content"
            logger.debug(msg, name)
            continue
        except IndexError:
            msg = "Skipping %s: Problem with dat file"
            logger.debug(msg, name)
            continue
        x, y, z, intrad = pdb_struct.integration_box
        model_molecular_weight = pdb_struct.molecular_weight
        mw_diff = abs(predicted_molecular_weight - model_molecular_weight)
        info = simbad.core.dat_score.DatModelScore(name, dat_model, mw_diff, x, y, z,
                                                   intrad, solvent_content, None)
        dat_models.append(info)

    sorted_dat_models = sorted(dat_models, key=lambda x: float(x.mw_diff), reverse=False)
    n_files = len(sorted_dat_models)
    chunk_size = simbad.rotsearch.get_chunk_size(n_files, chunk_size)
    total_chunk_cycles = simbad.rotsearch.get_total_chunk_cycles(n_files, chunk_size)

    if submit_qtype == 'local':
        processes = nproc
    else:
        processes = submit_nproc

    results = []
    iteration_range = range(0, n_files, chunk_size)
    for cycle, i in enumerate(iteration_range):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)

        if self.solution:
            logger.info("Early termination criteria met, skipping chunk %d", cycle + 1)
            continue

        collector = ScriptCollector(None)
        amore_files = []
        with pool.Pool(processes=processes) as p:
            for result in p.map(self, sorted_dat_models[i:i + chunk_size]):
                if result is not None:
                    collector.add(result[0])
                    amore_files.append(result[1])

        if len(collector.scripts) > 0:
            logger.info("Running AMORE tab/rot functions")
            amore_logs, dat_models = zip(*amore_files)
            simbad.util.submit_chunk(collector, self.script_log_dir, nproc, 'simbad_amore',
                                     submit_qtype, submit_queue, True, monitor,
                                     self.rot_succeeded_log)

            for dat_model, amore_log in zip(dat_models, amore_logs):
                base = os.path.basename(amore_log)
                pdb_code = base.replace("amore_", "").replace(".log", "")
                try:
                    rotsearch_parser = simbad.parsers.rotsearch_parser.AmoreRotsearchParser(amore_log)
                    score = simbad.core.amore_score.AmoreRotationScore(
                        pdb_code, dat_model, rotsearch_parser.alpha, rotsearch_parser.beta,
                        rotsearch_parser.gamma, rotsearch_parser.cc_f, rotsearch_parser.rf_f,
                        rotsearch_parser.cc_i, rotsearch_parser.cc_p, rotsearch_parser.icp,
                        rotsearch_parser.cc_f_z_score, rotsearch_parser.cc_p_z_score,
                        rotsearch_parser.num_of_rot)
                    if rotsearch_parser.cc_f_z_score:
                        results += [score]
                except IOError:
                    pass
        else:
            logger.critical("No structures to be trialled")

    self._search_results = results
    shutil.rmtree(self.script_log_dir)
    if os.path.isdir(default_tmp_dir):
        shutil.rmtree(default_tmp_dir)
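# A hedged sketch of a success callback like rot_succeeded_log above: per
# pyjob's Task.wait contract, a success function receives a log file path and
# returns a bool to trigger early termination. The "Solution found" marker is
# illustrative only, not the real SIMBAD criterion.
def example_success_func(log):
    if not os.path.isfile(log):
        return False
    with open(log) as f_in:
        return "Solution found" in f_in.read()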
def comparison(self, models, structures):
    """Compare a list of model structures to a second list of reference structures

    Parameters
    ----------
    models : list
        List containing the paths to the model structure files
    structures : list
        List containing the paths to the reference structure files

    Returns
    -------
    entries : list
        List of TMscore data entries on a per-model basis
    """
    if len(models) < 1 or len(structures) < 1:
        msg = 'No model structures provided' if len(models) < 1 else 'No reference structures provided'
        logger.critical(msg)
        raise RuntimeError(msg)
    elif len(structures) == 1:
        logger.info('Using single structure provided for all model comparisons')
        structures = [structures[0] for _ in range(len(models))]
    elif len(models) != len(structures):
        msg = "Unequal number of models and structures!"
        logger.critical(msg)
        raise RuntimeError(msg)

    if self.method == "tmalign":
        pt = tm_parser.TMalignLogParser()
    elif self.method == "tmscore":
        pt = tm_parser.TMscoreLogParser()
    else:
        msg = "Invalid method selected: %s" % self.method
        logger.critical(msg)
        raise RuntimeError(msg)

    logger.info('Using algorithm: {0}'.format(self.method))
    logger.info('------- Evaluating decoys -------')

    data_entries, log_files, job_scripts = [], [], []
    collector = ScriptCollector(None)
    for model_pdb, structure_pdb in zip(models, structures):
        model_name = os.path.splitext(os.path.basename(model_pdb))[0]
        structure_name = os.path.splitext(os.path.basename(structure_pdb))[0]
        stem = "_".join([model_name, structure_name, self.method])
        if os.path.isfile(model_pdb) and os.path.isfile(structure_pdb):
            data_entries.append([model_name, structure_name, model_pdb, structure_pdb])
            script = Script(directory=self.tmp_dir, prefix="tmscore_", stem=stem)
            script.append(" ".join([self.executable, model_pdb, structure_pdb]))
            collector.add(script)
            job_scripts.append(script.path)
            log_files.append(os.path.splitext(script.path)[0] + ".log")
        else:
            if not os.path.isfile(model_pdb):
                logger.warning("Cannot find: %s", model_pdb)
            if not os.path.isfile(structure_pdb):
                logger.warning("Cannot find: %s", structure_pdb)
            continue

    logger.info('Executing TManalysis scripts')
    with TaskFactory(
            self._qtype,
            collector,
            name="tmscore",
            nprocesses=self._nproc,
            max_array_size=self._max_array_jobs,
            queue=self._queue,
            shell="/bin/bash",
    ) as task:
        task.run()
        task.wait(interval=1)

    self.entries = []
    for entry, log, script in zip(data_entries, log_files, job_scripts):
        try:
            pt.reset()
            pt.parse(log)
        except Exception:
            logger.critical("Error processing the %s log file: %s", self.method, log)
            log = "None"
        model_name, structure_name, model_pdb, structure_pdb = entry
        _entry = self._store(model_name, structure_name, model_pdb, structure_pdb, log, pt)
        self.entries.append(_entry)
        os.unlink(script)
    return self.entries
def test_12(self):
    scripts = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
    sc = ScriptCollector(scripts)
    sc.add([])
    assert sc.scripts == scripts
class Task(abc.ABC):
    """Abstract base class for executable tasks"""

    def __init__(self, script, *args, **kwargs):
        """Instantiate a new :obj:`~pyjob.task.Task`

        Parameters
        ----------
        script : :obj:`~pyjob.script.ScriptCollector`, :obj:`~pyjob.script.Script`, str, list, tuple
            A :obj:`str`, :obj:`list` or :obj:`tuple` of one or more script paths

        """
        self.pid = None
        self.locked = False
        if isinstance(script, ScriptCollector):
            self.script_collector = script
        else:
            self.script_collector = ScriptCollector(script)
        self.directory = os.path.abspath(
            kwargs.get("directory") or config.get("directory") or "."
        )
        self.nprocesses = kwargs.get("processes") or config.get("processes") or 1

    def __del__(self):
        """Exit function at instance deletion"""
        if not self.locked:
            self.lock()
        self.close()

    def __enter__(self):
        """Contextmanager entry function

        Note
        ----
        For further details see `PEP 343 <https://www.python.org/dev/peps/pep-0343/>`_.

        """
        return self

    def __exit__(self, *exc):
        """Contextmanager exit function

        Note
        ----
        For further details see `PEP 343 <https://www.python.org/dev/peps/pep-0343/>`_.

        """
        if not self.locked:
            self.lock()
        self.close()

    def __repr__(self):
        """Representation of the :obj:`~pyjob.task.Task`"""
        return f"{self.__class__.__qualname__}(pid={self.pid})"

    # ------------------ Abstract methods and properties ------------------

    @property
    @abc.abstractmethod
    def info(self):  # pragma: no cover
        """Abstract property to provide info about the :obj:`~pyjob.task.Task`"""

    @abc.abstractmethod
    def close(self):  # pragma: no cover
        """Abstract method to end :obj:`~pyjob.task.Task`"""

    @abc.abstractmethod
    def kill(self):  # pragma: no cover
        """Abstract method to forcefully terminate :obj:`~pyjob.task.Task`"""

    @abc.abstractmethod
    def _run(self):  # pragma: no cover
        """Abstract method to start execution of the :obj:`~pyjob.task.Task`"""

    # ------------------ Other task-specific general methods ------------------

    @property
    def completed(self):
        """Boolean to indicate :obj:`~pyjob.task.Task` completion"""
        return self.locked and not bool(self.info)

    @property
    def log(self):
        """The log file path"""
        return [script.log for script in self.script_collector]

    @property
    def script(self):
        """The script file path"""
        return [script.path for script in self.script_collector]

    @staticmethod
    def get_time(minutes):
        """Return a runtime string with format hh:mm:ss to be used in a :obj:`~pyjob.task.Task`

        Parameters
        ----------
        minutes : int
            Integer with the number of minutes to allocate to runtime

        Raises
        ------
        :exc:`~pyjob.exception.PyJobError`
            Argument is not a positive integer

        """
        if isinstance(minutes, int) and minutes > 0:
            h, m = divmod(minutes, 60)
            return f"{h:02d}:{m:02d}:00"
        else:
            raise PyJobError("Task runtime has to be a positive integer!")

    def add_script(self, script):
        """Add further scripts to this :obj:`~pyjob.task.Task`

        Parameters
        ----------
        script : :obj:`~pyjob.script.Script`, str, list, tuple
            Something representing one or more scripts

        """
        if self.locked:
            raise PyJobTaskLockedError("This task is locked!")
        self.script_collector.add(script)

    def lock(self):
        """Lock this :obj:`~pyjob.task.Task`"""
        self.locked = True
        logger.debug("Locked %s [%d]", self.__class__.__qualname__, self.pid)

    def run(self):
        """Start the execution of this :obj:`~pyjob.task.Task`

        Raises
        ------
        :exc:`~pyjob.exception.PyJobError`
            One or more executable scripts required prior to execution
        :exc:`~pyjob.exception.PyJobTaskLockedError`
            Locked task, cannot restart or rerun

        """
        if self.locked:
            raise PyJobTaskLockedError("This task is locked!")
        if len(self.script_collector) < 1:
            raise PyJobError("One or more executable scripts required prior to execution")
        self.script_collector.dump()
        self._run()
        logger.debug("Started execution of %s [%d]", self.__class__.__qualname__, self.pid)
        self.lock()

    def wait(self, interval=30, monitor_f=None, success_f=None):
        """Method to wait for the completion of the current :obj:`~pyjob.task.Task`

        Parameters
        ----------
        interval : int, optional
            The interval to wait between checking (in seconds)
        monitor_f : callable, optional
            A :obj:`callable` that is regularly invoked
        success_f : callable, optional
            A :obj:`callable` to check for early termination of :obj:`~pyjob.task.Task`

        Note
        ----
        The `success_f` argument needs to accept a log file as input and return a :obj:`bool`.

        """

        def is_successful_run(log):
            return os.path.isfile(log) and success_f(log)

        def is_callable_fn(fn):
            return bool(fn and callable(fn))

        check_success = is_callable_fn(success_f)
        callback = monitor_f if is_callable_fn(monitor_f) else lambda: None
        if check_success:
            msg = "Checking for %s %d success with function %s"
            logger.debug(msg, self.__class__.__qualname__, self.pid, success_f.__name__)
        while not self.completed:
            if check_success:
                for log in self.log:
                    if is_successful_run(log):
                        logger.debug(
                            "%s %d succeeded, run log: %s",
                            self.__class__.__qualname__,
                            self.pid,
                            log,
                        )
                        self.kill()
            callback()
            time.sleep(interval)
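# A minimal sketch of a concrete Task subclass, showing which abstract members
# (info, close, kill, _run) a scheduler backend must implement. LocalEchoTask
# is hypothetical and runs only the first collected script via subprocess;
# real pyjob backends handle arrays of scripts and queue bookkeeping.
import subprocess


class LocalEchoTask(Task):
    def __init__(self, script, *args, **kwargs):
        super().__init__(script, *args, **kwargs)
        self._proc = None

    @property
    def info(self):
        # Non-empty while the process runs; empty once done, which flips `completed`
        return {"pid": self.pid} if self._proc and self._proc.poll() is None else {}

    def close(self):
        if self._proc:
            self._proc.wait()

    def kill(self):
        if self._proc:
            self._proc.terminate()

    def _run(self):
        # Scripts were dumped to disk by Task.run() before _run() is invoked
        self._proc = subprocess.Popen([self.script[0]])
        self.pid = self._proc.pid


# Usage would mirror the context-manager pattern supported by Task.__exit__:
# with LocalEchoTask(collector) as task:
#     task.run()
#     task.wait(interval=1)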
def test_8(self):
    with pytest.raises(ValueError):
        ScriptCollector(["test"])
def test_7(self):
    with pytest.raises(PyJobError):
        ScriptCollector([1])
def run(self, models_dir, nproc=2, min_solvent_content=20, submit_nproc=None,
        submit_qtype=None, submit_queue=None, monitor=None, chunk_size=0, **kwargs):
    """Run phaser rotation function on a directory of models

    Parameters
    ----------
    models_dir : str
        The directory containing the models to run the rotation search on
    nproc : int, optional
        The number of processors to run the job on
    min_solvent_content : int, float, optional
        The minimum solvent content present in the unit cell with the input model [default: 20]
    submit_nproc : int
        The number of processors to use on the head node when creating submission scripts on a cluster [default: 1]
    submit_qtype : str
        The cluster submission queue type - currently supports SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster
    monitor
    chunk_size : int, optional
        The number of jobs to submit at the same time

    Returns
    -------
    file
        log file for each model in the models_dir
    """
    self.submit_qtype = submit_qtype
    self.submit_queue = submit_queue
    self.mtz_labels = simbad.util.mtz_util.GetLabels(self.mtz)
    self.simbad_dat_files = simbad.db.find_simbad_dat_files(models_dir)

    i = InputMR_DAT()
    i.setHKLI(self.mtz)
    i.setLABI_F_SIGF(self.mtz_labels.f, self.mtz_labels.sigf)
    i.setMUTE(True)
    run_mr_data = runMR_DAT(i)

    sg = run_mr_data.getSpaceGroupName().replace(" ", "")
    cell = " ".join(map(str, run_mr_data.getUnitCell()))

    mat_coef = simbad.util.matthews_prob.MatthewsProbability(cell, sg)

    dir_name = "simbad-tmp-" + str(uuid.uuid1())
    self.script_log_dir = os.path.join(self.work_dir, dir_name)
    os.mkdir(self.script_log_dir)

    self.ccp4_scr = os.environ["CCP4_SCR"]
    default_tmp_dir = os.path.join(self.work_dir, 'tmp')
    if self.tmp_dir:
        self.template_tmp_dir = os.path.join(self.tmp_dir, dir_name + "-{0}")
    else:
        self.template_tmp_dir = os.path.join(default_tmp_dir, dir_name + "-{0}")

    predicted_molecular_weight = 0
    if run_mr_data.Success():
        i = InputCCA()
        i.setSPAC_HALL(run_mr_data.getSpaceGroupHall())
        i.setCELL6(run_mr_data.getUnitCell())
        i.setMUTE(True)
        run_cca = runCCA(i)
        if run_cca.Success():
            predicted_molecular_weight = run_cca.getAssemblyMW()

    dat_models = []
    for dat_model in self.simbad_dat_files:
        name = os.path.basename(dat_model.replace(".dat", ""))
        pdb_struct = simbad.util.pdb_util.PdbStructure()
        pdb_struct.from_file(dat_model)
        solvent_fraction, n_copies = mat_coef.calculate_content_ncopies_from_struct(pdb_struct)
        solvent_content = solvent_fraction * 100
        if solvent_content < min_solvent_content:
            msg = "Skipping %s: solvent content is predicted to be less than %.2f"
            logger.debug(msg, name, min_solvent_content)
            continue
        mw_diff = abs(predicted_molecular_weight - pdb_struct.molecular_weight)
        info = simbad.core.dat_score.DatModelScore(name, dat_model, mw_diff, None, None,
                                                   None, None, solvent_fraction, n_copies)
        dat_models.append(info)

    sorted_dat_models = sorted(dat_models, key=lambda x: float(x.mw_diff), reverse=False)
    n_files = len(sorted_dat_models)
    chunk_size = simbad.rotsearch.get_chunk_size(n_files, chunk_size)
    total_chunk_cycles = simbad.rotsearch.get_total_chunk_cycles(n_files, chunk_size)

    results = []
    iteration_range = range(0, n_files, chunk_size)
    for cycle, i in enumerate(iteration_range):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)

        if self.solution:
            logger.info("Early termination criteria met, skipping chunk %d", cycle + 1)
            continue

        self.template_model = os.path.join("$CCP4_SCR", "{0}.pdb")

        if submit_qtype == 'local':
            processes = nproc
        else:
            processes = submit_nproc

        collector = ScriptCollector(None)
        phaser_files = []
        with pool.Pool(processes=processes) as p:
            for result in p.map(self, sorted_dat_models[i:i + chunk_size]):
                if result is not None:
                    collector.add(result[0])
                    phaser_files.append(result[1])

        if len(phaser_files) > 0:
            logger.info("Running PHASER rotation functions")
            phaser_logs, dat_models = zip(*phaser_files)
            simbad.util.submit_chunk(collector, self.script_log_dir, nproc, 'simbad_phaser',
                                     submit_qtype, submit_queue, True, monitor,
                                     self.rot_succeeded_log)

            for dat_model, phaser_log in zip(dat_models, phaser_logs):
                base = os.path.basename(phaser_log)
                pdb_code = base.replace("phaser_", "").replace(".log", "")
                try:
                    phaser_rotation_parser = simbad.parsers.rotsearch_parser.PhaserRotsearchParser(phaser_log)
                    if phaser_rotation_parser.rfact:
                        phaser_rotation_parser.llg = 100
                        phaser_rotation_parser.rfz = 10
                    score = simbad.core.phaser_score.PhaserRotationScore(
                        pdb_code, dat_model, phaser_rotation_parser.llg, phaser_rotation_parser.rfz)
                    if phaser_rotation_parser.rfz:
                        results += [score]
                except IOError:
                    pass
        else:
            logger.critical("No structures to be trialled")

    self._search_results = results
    shutil.rmtree(self.script_log_dir)
    if os.path.isdir(default_tmp_dir):
        shutil.rmtree(default_tmp_dir)
def create_morda_db(database, nproc=2, submit_qtype=None, submit_queue=False, chunk_size=5000):
    """Create the MoRDa search database

    Parameters
    ----------
    database : str
        The path to the database folder
    nproc : int, optional
        The number of processors [default: 2]
    submit_qtype : str
        The cluster submission queue type - currently supports SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster
    chunk_size : int, optional
        The number of jobs to submit at the same time [default: 5000]

    Raises
    ------
    RuntimeError
        Windows is currently not supported
    """
    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    if "MRD_DB" in os.environ:
        morda_installed_through_ccp4 = True
    else:
        download_morda()
        morda_installed_through_ccp4 = False

    morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
    simbad_dat_path = os.path.join(database, '**', '*.dat')
    morda_dat_files = {os.path.basename(f) for f in glob.glob(morda_dat_path)}
    simbad_dat_files = {os.path.basename(f) for f in glob.glob(simbad_dat_path)}
    erroneous_files = {"1bbzA_0.dat", "1gt0D_0.dat", "1h3oA_0.dat", "1kskA_1.dat", "1l0sA_0.dat"}

    def delete_erroneous_files(erroneous_paths):
        for f in erroneous_paths:
            if os.path.isfile(f):
                logger.warning("File flagged to be erroneous ... removing from database: %s", f)
                os.remove(f)

    erroneous_paths = [os.path.join(database, name[1:3], name) for name in erroneous_files]
    delete_erroneous_files(erroneous_paths)

    dat_files = list(morda_dat_files - simbad_dat_files - erroneous_files)
    if len(dat_files) < 1:
        logger.info('SIMBAD database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(os.environ["MRD_DB"])
        leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
        return
    else:
        logger.info("%d new entries were found in the MoRDa database, updating SIMBAD database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")
    run_dir = tmp_dir(directory=os.getcwd())

    # Submit in chunks, so we don't take up too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = len(dat_files) // chunk_size + (len(dat_files) % chunk_size > 0)
    for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)
        chunk_dat_files = dat_files[i:i + chunk_size]

        # Create the database files
        files = []
        collector = ScriptCollector(None)
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            final_file = os.path.join(database, code[1:3], code + ".dat")
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            cmd = [
                ["export CCP4_SCR=" + tmp_d],
                ["export MRD_DB=" + os.environ['MRD_DB']],
                ["cd", tmp_d],
                [exe, "-c", code, "-m", "d"],
            ]
            script = Script(directory=tmp_d)
            for c in cmd:
                script.append(' '.join(map(str, c)))
            collector.add(script)
            log = script.path.rsplit('.', 1)[0] + '.log'
            files += [(script.path, log, tmp_d, (get_model_output, final_file))]

        scripts, _, tmps, files = zip(*files)

        submit_chunk(collector=collector, run_dir=os.getcwd(), nproc=nproc, job_name='morda_db',
                     submit_qtype=submit_qtype, submit_queue=submit_queue, permit_nonzero=True,
                     monitor=None, success_func=None)

        sub_dir_names = {os.path.basename(f).rsplit('.', 1)[0][1:3] for f in chunk_dat_files}
        for sub_dir_name in sub_dir_names:
            sub_dir = os.path.join(database, sub_dir_name)
            if os.path.isdir(sub_dir):
                continue
            os.makedirs(sub_dir)

        for output, final in files:
            if os.path.isfile(output):
                simbad.db.convert_pdb_to_dat(output, final)
            else:
                logger.critical("File missing: {}".format(output))

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        shutil.rmtree(os.environ["MRD_DB"])

    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
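# The chunk bookkeeping above is plain ceiling division; a small illustrative
# helper (hypothetical, not part of SIMBAD) with the same arithmetic:
def _total_chunk_cycles(n_files, chunk_size):
    """Number of chunks needed to cover n_files in batches of chunk_size."""
    return n_files // chunk_size + (n_files % chunk_size > 0)


assert _total_chunk_cycles(12000, 5000) == 3  # 5000 + 5000 + 2000
assert _total_chunk_cycles(10000, 5000) == 2  # exact fit, no remainder chunk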
def test_3(self):
    script = pytest.helpers.get_py_script(0, 1)
    sc = ScriptCollector(script)
    assert sc.scripts == [script]
def test_2(self):
    sc = ScriptCollector(None)
    assert sc.scripts == []
def test_1(self):
    sc = ScriptCollector([])
    assert sc.scripts == []
def test_13(self):
    container = ScriptCollector(pytest.helpers.get_py_script(10, 1))
    task = MockTask(container)
    task.add_script([pytest.helpers.get_py_script(i, 1) for i in range(5)])
    assert len(task.script_collector) == 6
def create_contaminant_db(database, add_morda_domains, nproc=2, submit_qtype=None, submit_queue=False):
    """Create a contaminant database

    Parameters
    ----------
    database : str
        The path to the database folder
    add_morda_domains : bool
        Retrospectively add morda domains to a contaminant database updated when morda was not installed
    nproc : int, optional
        The number of processors [default: 2]
    submit_qtype : str
        The cluster submission queue type - currently supports SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster

    Raises
    ------
    RuntimeError
        dimple.contaminants.prepare module not available
    RuntimeError
        Windows is currently not supported
    """
    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    import dimple.main
    logger.info('DIMPLE version: %s', dimple.main.__version__)

    if StrictVersion(dimple.main.__version__) < StrictVersion('2.5.7'):
        msg = "This feature will be available with dimple version 2.5.7"
        raise RuntimeError(msg)

    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    import dimple.contaminants.prepare

    dimple.contaminants.prepare.main(verbose=False)

    simbad_dat_path = os.path.join(database, '*', '*', '*', '*.dat')
    existing_dat_files = [os.path.basename(f).split('.')[0].lower() for f in glob.iglob(simbad_dat_path)]
    erroneous_files = ['4v43']
    dimple_files = ['cached', 'data.json', 'data.py']

    with open("data.json") as data_file:
        data = json.load(data_file)

    results = []
    for child in data["children"]:
        try:
            for child_2 in child["children"]:
                space_group = child_2["name"].replace(" ", "")
                for child_3 in child_2["children"]:
                    pdb_code = child_3["name"].split()[0].lower()
                    if (pdb_code in existing_dat_files or pdb_code in erroneous_files) and not add_morda_domains:
                        continue
                    uniprot_name = child["name"]
                    uniprot_mnemonic = uniprot_name.split('_')[1]
                    score = ContaminantSearchResult(pdb_code, space_group, uniprot_name, uniprot_mnemonic)
                    results.append(score)
        except KeyError:
            pass

    if len(results) == 0:
        logger.info("Contaminant database up to date")
    else:
        if add_morda_domains:
            logger.info("Adding morda domains to contaminant database")
        else:
            logger.info("%d new entries were found in the contaminant database, updating SIMBAD database", len(results))

        if "MRD_DB" in os.environ:
            morda_installed_through_ccp4 = True
        else:
            morda_installed_through_ccp4 = False

        if add_morda_domains and not morda_installed_through_ccp4:
            logger.critical("Morda not installed locally, unable to add morda domains to contaminant database")

        if morda_installed_through_ccp4:
            morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
            morda_dat_files = {os.path.basename(f) for f in glob.iglob(morda_dat_path)}
            exe = os.path.join(os.environ['MRD_PROG'], "get_model")
        else:
            logger.info("Morda not installed locally, therefore morda domains will not be added to contaminant database")

        files = []
        collector = ScriptCollector(None)
        for result in results:
            stem = os.path.join(os.getcwd(), database, result.uniprot_mnemonic,
                                result.uniprot_name, result.space_group)
            if not os.path.exists(stem):
                os.makedirs(stem)

            content = PdbStructure.get_pdb_content(result.pdb_code)
            if content is None:
                logger.debug("Encountered a problem downloading PDB %s - skipping entry", result.pdb_code)
            else:
                dat_content = simbad.db._str_to_dat(content)
                with open(os.path.join(stem, result.pdb_code + ".dat"), "w") as f_out:
                    f_out.write(dat_content)

                if not simbad.db.is_valid_dat(os.path.join(stem, result.pdb_code + ".dat")):
                    logger.debug("Unable to convert %s to dat file", result.pdb_code)

            if morda_installed_through_ccp4:
                for dat_file in morda_dat_files:
                    if result.pdb_code.lower() == dat_file[0:4]:
                        stem = os.path.join(database, result.uniprot_mnemonic,
                                            result.uniprot_name, result.space_group, "morda")
                        if not os.path.exists(stem):
                            os.makedirs(stem)
                        code = dat_file.rsplit('.', 1)[0]
                        final_file = os.path.join(stem, dat_file)
                        tmp_d = tmp_dir(directory=os.getcwd())
                        get_model_output = os.path.join(tmp_d, code + ".pdb")
                        cmd = [
                            ["export CCP4_SCR=" + tmp_d],
                            ["cd", tmp_d],
                            [exe, "-c", code, "-m", "d"],
                        ]
                        script = Script(directory=tmp_d)
                        for c in cmd:
                            script.append(' '.join(map(str, c)))
                        collector.add(script)
                        log = script.path.rsplit('.', 1)[0] + '.log'
                        files += [(script.path, log, tmp_d, (get_model_output, final_file))]

        if len(files) > 0:
            scripts, _, tmps, files = zip(*files)

            submit_chunk(collector=collector, run_dir=os.getcwd(), nproc=nproc, job_name='cont_db',
                         submit_qtype=submit_qtype, submit_queue=submit_queue, permit_nonzero=True,
                         monitor=None, success_func=None)

            for output, final in files:
                if os.path.isfile(output):
                    simbad.db.convert_pdb_to_dat(output, final)
                else:
                    logger.critical("File missing: {}".format(output))

            for d in tmps:
                shutil.rmtree(d)

    for f in dimple_files:
        if os.path.isdir(f):
            shutil.rmtree(f)
        elif os.path.isfile(f):
            os.remove(f)

    validate_compressed_database(database)
def test_9(self):
    with pytest.raises(IOError):
        ScriptCollector(["test.sh"])
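# Taken together, tests 7-9 pin down ScriptCollector's input validation: a
# non-Script, non-path entry raises PyJobError; a path lacking a file
# extension raises ValueError; a well-formed path to a file that does not
# exist raises IOError. The loop below simply restates those assertions:
for bad_input, expected in [([1], PyJobError), (["test"], ValueError), (["test.sh"], IOError)]:
    with pytest.raises(expected):
        ScriptCollector(bad_input)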