def test_tmp_dir_6(self):
    """tmp_dir honours directory, prefix and suffix when given together."""
    tmp = tmp_dir(directory=os.getcwd(), prefix="first", suffix="last")
    try:
        self.assertTrue(os.path.isdir(tmp))
        path, name = os.path.split(tmp)
        self.assertEqual(os.getcwd(), path)
        self.assertTrue(name.startswith("first"))
        self.assertTrue(name.endswith("last"))
    finally:
        # Clean up even when an assertion fails, otherwise the directory
        # is left behind in the current working directory
        if os.path.isdir(tmp):
            shutil.rmtree(tmp)
def test_tmp_dir_5(self):
    """tmp_dir appends a suffix that itself contains dots and dashes."""
    tmp = tmp_dir(suffix="simbad.the-last")
    try:
        self.assertTrue(os.path.isdir(tmp))
        _, name = os.path.split(tmp)
        self.assertTrue(name.endswith("simbad.the-last"))
    finally:
        # Clean up even when an assertion fails so no directory is leaked
        if os.path.isdir(tmp):
            shutil.rmtree(tmp)
def test_tmp_dir_3(self):
    """tmp_dir starts the directory name with the requested prefix."""
    tmp = tmp_dir(prefix="first")
    try:
        self.assertTrue(os.path.isdir(tmp))
        _, name = os.path.split(tmp)
        self.assertTrue(name.startswith("first"))
    finally:
        # Clean up even when an assertion fails so no directory is leaked
        if os.path.isdir(tmp):
            shutil.rmtree(tmp)
def test_tmp_dir_2(self):
    """tmp_dir creates the directory inside the requested parent."""
    tmp = tmp_dir(directory=os.getcwd())
    try:
        self.assertTrue(os.path.isdir(tmp))
        path, _ = os.path.split(tmp)
        self.assertEqual(os.getcwd(), path)
    finally:
        # Clean up even when an assertion fails, otherwise the directory
        # is left behind in the current working directory
        if os.path.isdir(tmp):
            shutil.rmtree(tmp)
def test_tmp_dir_1(self):
    """tmp_dir without arguments returns the path of an existing directory."""
    tmp = tmp_dir()
    try:
        self.assertTrue(os.path.isdir(tmp))
    finally:
        # Only remove what actually exists so a failed assertion above is
        # not masked by an error from rmtree
        if os.path.isdir(tmp):
            shutil.rmtree(tmp)
def create_morda_db(database, nproc=2, submit_qtype=None, submit_queue=False, chunk_size=5000):
    """Create the MoRDa search database

    Every ``*.dat`` entry found in the MoRDa installation that has no
    matching ``*.pdb`` file in the SIMBAD database is generated with the
    MoRDa ``get_model`` executable and moved into ``database``.

    Parameters
    ----------
    database : str
       The path to the database folder
    nproc : int, optional
       The number of processors [default: 2]
    submit_qtype : str
       The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
       The queue to submit to on the cluster
    chunk_size : int, optional
       The number of jobs to submit at the same time [default: 5000]

    Raises
    ------
    RuntimeError
       Windows is currently not supported
    RuntimeError
       The database location failed the write-permission check

    """
    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    # A pre-set MRD_DB marks an existing MoRDa installation that must be kept.
    # Otherwise MoRDa is downloaded and removed again once we are done; the
    # code below relies on download_morda() to provide MRD_DB and MRD_PROG.
    morda_installed_through_ccp4 = "MRD_DB" in os.environ
    if not morda_installed_through_ccp4:
        download_morda()

    morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
    simbad_pdb_path = os.path.join(database, '**', '*.pdb')
    morda_dat_files = {os.path.basename(f) for f in glob.glob(morda_dat_path)}
    # Express the SIMBAD entries as ".dat" names so both sets are comparable
    simbad_dat_files = {os.path.basename(f).split('.')[0] + '.dat' for f in glob.glob(simbad_pdb_path)}
    erroneous_files = {"1bbzA_0.pdb", "1gt0D_0.pdb", "1h3oA_0.pdb", "1kskA_1.pdb", "1l0sA_0.pdb"}

    # Purge entries flagged as erroneous from an existing database
    for name in erroneous_files:
        erroneous_path = os.path.join(database, name[1:3], name)
        if os.path.isfile(erroneous_path):
            logger.warning("File flagged to be erroneous ... removing from database: %s", erroneous_path)
            os.remove(erroneous_path)

    # The MoRDa names end in ".dat", so the flagged ".pdb" names need the same
    # extension or the subtraction below would never exclude them
    erroneous_dat_files = {os.path.splitext(name)[0] + '.dat' for name in erroneous_files}
    # Sorted so that chunk composition is reproducible between runs
    dat_files = sorted(morda_dat_files - simbad_dat_files - erroneous_dat_files)

    if not dat_files:
        logger.info('SIMBAD database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(os.environ["MRD_DB"])
        leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
        return

    logger.info("%d new entries were found in the MoRDa database, updating SIMBAD database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")
    run_dir = tmp_dir(directory=os.getcwd())

    # Submit in chunks, so we don't take too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = (len(dat_files) + chunk_size - 1) // chunk_size  # ceiling division
    for cycle, start in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)
        chunk_dat_files = dat_files[start:start + chunk_size]

        # Create the database files
        what_to_do = []
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            final_file = os.path.join(database, code[1:3], code + '.pdb')
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            # Both exports are single tokens so that no whitespace can end up
            # between the "=" and the value in the generated script
            script = make_script(
                [["export CCP4_SCR=" + tmp_d], ["export MRD_DB=" + os.environ['MRD_DB']], ["cd", tmp_d],
                 [exe, "-c", code, "-m", "d"]],
                directory=tmp_d)
            what_to_do.append((script, tmp_d, (get_model_output, final_file)))

        scripts, tmps, files = zip(*what_to_do)
        j = Job(submit_qtype)
        j.submit(scripts, name='morda_db', nproc=nproc, queue=submit_queue)
        j.wait()

        # Entries are stored in sub-directories named after characters 2-3 of the code
        sub_dir_names = {os.path.basename(f).rsplit('.', 1)[0][1:3] for f in chunk_dat_files}
        for sub_dir_name in sub_dir_names:
            sub_dir = os.path.join(database, sub_dir_name)
            if not os.path.isdir(sub_dir):
                os.makedirs(sub_dir)

        for output, final in files:
            if os.path.isfile(output):
                shutil.move(output, final)
            else:
                logger.critical("File missing: %s", output)

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        shutil.rmtree(os.environ["MRD_DB"])

    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
def create_contaminant_db(database, add_morda_domains, nproc=2, submit_qtype=None, submit_queue=False):
    """Create a contaminant database

    The contaminant list is prepared by ``dimple.contaminants.prepare`` and
    read from the ``data.json`` file in the current working directory. Each
    new entry is downloaded and stored as a ``.dat`` file under
    ``database/<mnemonic>/<uniprot name>/<space group>``. If ``MRD_DB`` is
    set in the environment, matching MoRDa domains are generated with
    ``get_model`` and stored in a ``morda`` sub-directory.

    Parameters
    ----------
    database : str
       The path to the database folder
    add_morda_domains : bool
       Retrospectively add morda domains to a contaminant database updated when morda was not installed
    nproc : int, optional
       The number of processors [default: 2]
    submit_qtype : str
       The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
       The queue to submit to on the cluster

    Raises
    ------
    RuntimeError
       The database location failed the write-permission check
    RuntimeError
       The installed dimple version is older than 2.5.7
    RuntimeError
       Windows is currently not supported

    """
    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    import dimple.main
    logger.info('DIMPLE version: %s', dimple.main.__version__)

    if StrictVersion(dimple.main.__version__) < StrictVersion('2.5.7'):
        msg = "This feature will be available with dimple version 2.5.7"
        raise RuntimeError(msg)

    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    import dimple.contaminants.prepare

    dimple.contaminants.prepare.main(verbose=False)

    simbad_dat_path = os.path.join(database, '*', '*', '*', '*.dat')
    # Sets, because membership is tested for every entry in the nested loops below
    existing_dat_files = {os.path.basename(f).split('.')[0].lower() for f in glob.iglob(simbad_dat_path)}
    erroneous_files = {'4v43'}
    # Scratch files/directories in the working directory that are removed at the end
    dimple_files = ['cached', 'data.json', 'data.py']

    with open("data.json") as data_file:
        data = json.load(data_file)

    results = []
    for child in data["children"]:
        try:
            for child_2 in child["children"]:
                space_group = child_2["name"].replace(" ", "")
                for child_3 in child_2["children"]:
                    pdb_code = child_3["name"].split()[0].lower()
                    # Known entries are only revisited when morda domains are added retrospectively
                    already_handled = pdb_code in existing_dat_files or pdb_code in erroneous_files
                    if already_handled and not add_morda_domains:
                        continue
                    uniprot_name = child["name"]
                    uniprot_mnemonic = uniprot_name.split('_')[1]
                    score = ContaminantSearchResult(pdb_code, space_group, uniprot_name, uniprot_mnemonic)
                    results.append(score)
        except KeyError as err:
            # Best effort: the rest of a node lacking an expected key is skipped
            logger.debug("Skipping contaminant entry with missing key %s", err)

    if not results:
        logger.info("Contaminant database up to date")
        return

    if add_morda_domains:
        logger.info("Adding morda domains to contaminant database")
    else:
        logger.info("%d new entries were found in the contaminant database, updating SIMBAD database", len(results))

    morda_installed_through_ccp4 = "MRD_DB" in os.environ

    if add_morda_domains and not morda_installed_through_ccp4:
        logger.critical("Morda not installed locally, unable to add morda domains to contaminant database")

    if morda_installed_through_ccp4:
        morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
        morda_dat_files = {os.path.basename(f) for f in glob.iglob(morda_dat_path)}
        exe = os.path.join(os.environ['MRD_PROG'], "get_model")
    else:
        logger.info("Morda not installed locally, therefore morda domains will not be added to contaminant database")

    what_to_do = []
    for result in results:
        stem = os.path.join(os.getcwd(), database, result.uniprot_mnemonic, result.uniprot_name, result.space_group)
        if not os.path.exists(stem):
            os.makedirs(stem)

        content = PdbStructure.get_pdb_content(result.pdb_code)
        if content is None:
            logger.debug("Encountered a problem downloading PDB %s - skipping entry", result.pdb_code)
        else:
            dat_path = os.path.join(stem, result.pdb_code + ".dat")
            with open(dat_path, "w") as f_out:
                f_out.write(simbad.db._str_to_dat(content))

            if not simbad.db.is_valid_dat(dat_path):
                logger.debug("Unable to convert %s to dat file", result.pdb_code)

        if not morda_installed_through_ccp4:
            continue

        # Queue a "get_model" job for every MoRDa domain whose name starts with this PDB code
        for dat_file in morda_dat_files:
            if result.pdb_code.lower() != dat_file[0:4]:
                continue

            stem = os.path.join(database, result.uniprot_mnemonic, result.uniprot_name, result.space_group, "morda")
            if not os.path.exists(stem):
                os.makedirs(stem)

            code = dat_file.rsplit('.', 1)[0]
            final_file = os.path.join(stem, dat_file)
            tmp_d = tmp_dir(directory=os.getcwd())
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            # The export is a single token so that no whitespace can end up
            # between the "=" and the value in the generated script
            script = make_script(
                [["export CCP4_SCR=" + tmp_d], ["cd", tmp_d], [exe, "-c", code, "-m", "d"]], directory=tmp_d)
            what_to_do.append((script, tmp_d, (get_model_output, final_file)))

    if what_to_do:
        scripts, tmps, files = zip(*what_to_do)
        j = Job(submit_qtype)
        j.submit(scripts, name='cont_db', nproc=nproc, queue=submit_queue)
        j.wait()

        for output, final in files:
            if os.path.isfile(output):
                simbad.db.convert_pdb_to_dat(output, final)
            else:
                logger.critical("File missing: %s", output)

        for d in tmps:
            shutil.rmtree(d)

    for f in dimple_files:
        if os.path.isdir(f):
            shutil.rmtree(f)
        elif os.path.isfile(f):
            os.remove(f)

    validate_compressed_database(database)