Exemplo n.º 1
0
 def test_tmp_dir_6(self):
     # tmp_dir called with directory, prefix and suffix together: the new
     # directory must live in `directory` and its name must carry both affixes
     cwd = os.getcwd()
     tmp = tmp_dir(directory=cwd, prefix="first", suffix="last")
     try:
         self.assertTrue(os.path.isdir(tmp))
         path, name = os.path.split(tmp)
         self.assertEqual(cwd, path)
         self.assertTrue(name.startswith("first"))
         self.assertTrue(name.endswith("last"))
     finally:
         # Clean up even when an assertion above fails; the guard keeps a
         # missing directory from masking the original assertion error
         if os.path.isdir(tmp):
             shutil.rmtree(tmp)
Exemplo n.º 2
0
 def test_tmp_dir_5(self):
     # A suffix that itself contains dots and dashes must end the name verbatim
     tmp = tmp_dir(suffix="simbad.the-last")
     try:
         self.assertTrue(os.path.isdir(tmp))
         _, name = os.path.split(tmp)
         self.assertTrue(name.endswith("simbad.the-last"))
     finally:
         # Clean up even when an assertion above fails; the guard keeps a
         # missing directory from masking the original assertion error
         if os.path.isdir(tmp):
             shutil.rmtree(tmp)
Exemplo n.º 3
0
 def test_tmp_dir_3(self):
     # Only a prefix is given: the directory name must start with it
     tmp = tmp_dir(prefix="first")
     try:
         self.assertTrue(os.path.isdir(tmp))
         _, name = os.path.split(tmp)
         self.assertTrue(name.startswith("first"))
     finally:
         # Clean up even when an assertion above fails; the guard keeps a
         # missing directory from masking the original assertion error
         if os.path.isdir(tmp):
             shutil.rmtree(tmp)
Exemplo n.º 4
0
 def test_tmp_dir_2(self):
     # Only a parent directory is given: the new directory must be created in it
     cwd = os.getcwd()
     tmp = tmp_dir(directory=cwd)
     try:
         self.assertTrue(os.path.isdir(tmp))
         path, _ = os.path.split(tmp)
         self.assertEqual(cwd, path)
     finally:
         # Clean up even when an assertion above fails; the guard keeps a
         # missing directory from masking the original assertion error
         if os.path.isdir(tmp):
             shutil.rmtree(tmp)
Exemplo n.º 5
0
 def test_tmp_dir_1(self):
     # With no arguments tmp_dir must still hand back an existing directory
     tmp = tmp_dir()
     try:
         self.assertTrue(os.path.isdir(tmp))
     finally:
         # Clean up even when the assertion above fails; the guard keeps a
         # missing directory from masking the original assertion error
         if os.path.isdir(tmp):
             shutil.rmtree(tmp)
Exemplo n.º 6
0
def create_morda_db(database, nproc=2, submit_qtype=None, submit_queue=False, chunk_size=5000):
    """Create the MoRDa search database

    Every MoRDa domain (``*.dat`` below ``$MRD_DB/home/ca_DOM``) that has no
    ``<code>.pdb`` counterpart in the SIMBAD database yet is extracted with
    the ``get_model`` program found in ``$MRD_PROG`` and moved to
    ``<database>/<code[1:3]>/<code>.pdb``.

    Parameters
    ----------
    database : str
       The path to the database folder
    nproc : int, optional
       The number of processors [default: 2]
    submit_qtype : str
       The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
       The queue to submit to on the cluster
    chunk_size : int, optional
       The number of jobs to submit at the same time [default: 5000]

    Raises
    ------
    ValueError
       chunk_size is smaller than 1
    RuntimeError
       Windows is currently not supported
    RuntimeError
       The database location is rejected by ``is_valid_db_location``

    """
    # A non-positive chunk size would either divide by zero or silently skip
    # every entry while still stamping the database as updated
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer, got {}".format(chunk_size))

    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    if "MRD_DB" in os.environ:
        morda_installed_through_ccp4 = True
    else:
        # NOTE(review): presumably download_morda() exports MRD_DB (and
        # MRD_PROG) for this process, both are read below - confirm
        download_morda()
        morda_installed_through_ccp4 = False

    morda_db = os.environ['MRD_DB']
    morda_dat_path = os.path.join(morda_db, 'home', 'ca_DOM', '*.dat')
    simbad_pdb_path = os.path.join(database, '**', '*.pdb')
    morda_dat_files = {os.path.basename(f) for f in glob.glob(morda_dat_path)}
    # Existing SIMBAD entries are stored as .pdb; express them as .dat names
    # so that they can be compared against the MoRDa file names
    simbad_dat_files = {os.path.basename(f).split('.')[0] + '.dat' for f in glob.glob(simbad_pdb_path)}

    erroneous_codes = ("1bbzA_0", "1gt0D_0", "1h3oA_0", "1kskA_1", "1l0sA_0")

    # Purge erroneous entries that an earlier run may have left in the database
    for code in erroneous_codes:
        erroneous_path = os.path.join(database, code[1:3], code + ".pdb")
        if os.path.isfile(erroneous_path):
            logger.warning("File flagged to be erroneous ... removing from database: %s", erroneous_path)
            os.remove(erroneous_path)

    # The exclusion has to use .dat names: subtracting .pdb names from a set
    # of .dat names never matches and the erroneous entries would be rebuilt
    erroneous_dat_files = {code + ".dat" for code in erroneous_codes}
    dat_files = sorted(morda_dat_files - simbad_dat_files - erroneous_dat_files)
    if not dat_files:
        logger.info('SIMBAD database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(morda_db)
        leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
        return

    logger.info("%d new entries were found in the MoRDa database, updating SIMBAD database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")

    run_dir = tmp_dir(directory=os.getcwd())

    # Submit in chunks, so we don't take too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = (len(dat_files) + chunk_size - 1) // chunk_size
    for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)
        chunk_dat_files = dat_files[i:i + chunk_size]

        # Create the database files
        what_to_do = []
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            final_file = os.path.join(database, code[1:3], code + '.pdb')
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            # Each export is a single token, so nothing can be inserted
            # between "=" and the value when the command line is assembled
            script = make_script(
                [["export CCP4_SCR=" + tmp_d], ["export MRD_DB=" + morda_db], ["cd", tmp_d],
                 [exe, "-c", code, "-m", "d"]],
                directory=tmp_d)
            log = script.rsplit('.', 1)[0] + '.log'
            what_to_do.append((script, log, tmp_d, (get_model_output, final_file)))

        scripts, _, tmps, files = zip(*what_to_do)
        j = Job(submit_qtype)
        j.submit(scripts, name='morda_db', nproc=nproc, queue=submit_queue)
        j.wait()

        # Make sure every destination sub-directory exists before moving files
        for sub_dir in {os.path.dirname(final) for _, final in files}:
            if not os.path.isdir(sub_dir):
                os.makedirs(sub_dir)

        for output, final in files:
            if os.path.isfile(output):
                shutil.move(output, final)
            else:
                logger.critical("File missing: %s", output)

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        # The MoRDa database was only fetched for this update, drop it again
        shutil.rmtree(morda_db)

    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
Exemplo n.º 7
0
def create_contaminant_db(database, add_morda_domains, nproc=2, submit_qtype=None, submit_queue=False):
    """Create a contaminant database

    The contaminant list is read from the ``data.json`` file in the current
    working directory after running ``dimple.contaminants.prepare``. Each new
    entry is fetched through ``PdbStructure.get_pdb_content`` and stored as
    ``<database>/<mnemonic>/<uniprot name>/<space group>/<pdb code>.dat``.
    When ``MRD_DB`` is set in the environment, the matching MoRDa domains are
    added to a ``morda`` sub-directory next to it.

    Parameters
    ----------
    database : str
        The path to the database folder
    add_morda_domains : bool
        Retrospectively add morda domains to a contaminant database updated when morda was not installed
    nproc : int, optional
        The number of processors [default: 2]
    submit_qtype : str
        The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster

    Raises
    ------
    RuntimeError
        The database location is rejected by ``is_valid_db_location``
    RuntimeError
        The installed dimple version is older than 2.5.7
    RuntimeError
       Windows is currently not supported
    """
    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    import dimple.main
    logger.info('DIMPLE version: %s', dimple.main.__version__)

    if StrictVersion(dimple.main.__version__) < StrictVersion('2.5.7'):
        msg = "This feature will be available with dimple version 2.5.7"
        raise RuntimeError(msg)

    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    import dimple.contaminants.prepare

    # NOTE(review): presumably this writes data.json (plus 'cached' and
    # data.py) into the current working directory - data.json is read below
    dimple.contaminants.prepare.main(verbose=False)

    simbad_dat_path = os.path.join(database, '*', '*', '*', '*.dat')
    # Sets, because both are probed once per contaminant entry below
    existing_dat_files = {os.path.basename(f).split('.')[0].lower() for f in glob.iglob(simbad_dat_path)}
    erroneous_files = {'4v43'}
    dimple_files = ['cached', 'data.json', 'data.py']

    with open("data.json") as data_file:
        data = json.load(data_file)

    # Walk the three nested "children" levels of data.json; the names are
    # used as uniprot name -> space group -> pdb code
    results = []
    for child in data["children"]:
        try:
            for child_2 in child["children"]:
                space_group = child_2["name"].replace(" ", "")
                for child_3 in child_2["children"]:
                    pdb_code = child_3["name"].split()[0].lower()
                    if (pdb_code in existing_dat_files or pdb_code in erroneous_files) and not add_morda_domains:
                        continue
                    uniprot_name = child["name"]
                    uniprot_mnemonic = uniprot_name.split('_')[1]
                    score = ContaminantSearchResult(pdb_code, space_group, uniprot_name, uniprot_mnemonic)
                    results.append(score)
        except KeyError:
            # Best effort: entries lacking an expected key are skipped, but
            # leave a trace instead of dropping them silently
            logger.debug("Skipping incomplete contaminant entry: %s", child.get("name", "<unnamed>"))

    if not results:
        logger.info("Contaminant database up to date")
    else:
        if add_morda_domains:
            logger.info("Adding morda domains to contaminant database")
        else:
            logger.info("%d new entries were found in the contaminant database, updating SIMBAD database",
                        len(results))

        morda_installed_through_ccp4 = "MRD_DB" in os.environ

        if add_morda_domains and not morda_installed_through_ccp4:
            logger.critical("Morda not installed locally, unable to add morda domains to contaminant database")

        # MoRDa domain files indexed by the PDB code they start with, so each
        # result is a single lookup rather than a scan over every domain
        morda_domains = {}
        if morda_installed_through_ccp4:
            morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
            for dat_name in {os.path.basename(f) for f in glob.iglob(morda_dat_path)}:
                morda_domains.setdefault(dat_name[0:4], []).append(dat_name)
            exe = os.path.join(os.environ['MRD_PROG'], "get_model")
        else:
            logger.info(
                "Morda not installed locally, therefore morda domains will not be added to contaminant database")

        what_to_do = []
        for result in results:
            stem = os.path.join(os.getcwd(), database, result.uniprot_mnemonic, result.uniprot_name, result.space_group)
            if not os.path.exists(stem):
                os.makedirs(stem)

            content = PdbStructure.get_pdb_content(result.pdb_code)
            if content is None:
                logger.debug("Encountered a problem downloading PDB %s - skipping entry", result.pdb_code)
            else:
                dat_path = os.path.join(stem, result.pdb_code + ".dat")
                dat_content = simbad.db._str_to_dat(content)
                with open(dat_path, "w") as f_out:
                    f_out.write(dat_content)

                if not simbad.db.is_valid_dat(dat_path):
                    logger.debug("Unable to convert %s to dat file", result.pdb_code)

            # Empty when MoRDa is not installed, so no domain jobs are queued
            for dat_file in morda_domains.get(result.pdb_code.lower(), ()):
                morda_stem = os.path.join(database, result.uniprot_mnemonic, result.uniprot_name,
                                          result.space_group, "morda")
                if not os.path.exists(morda_stem):
                    os.makedirs(morda_stem)
                code = dat_file.rsplit('.', 1)[0]
                final_file = os.path.join(morda_stem, dat_file)
                # One scratch directory per job, get_model writes <code>.pdb into it
                tmp_d = tmp_dir(directory=os.getcwd())
                get_model_output = os.path.join(tmp_d, code + ".pdb")
                # The export is a single token, so nothing can be inserted
                # between "=" and the path when the command line is assembled
                script = make_script(
                    [["export CCP4_SCR=" + tmp_d], ["cd", tmp_d], [exe, "-c", code, "-m", "d"]], directory=tmp_d)
                log = script.rsplit('.', 1)[0] + '.log'
                what_to_do.append((script, log, tmp_d, (get_model_output, final_file)))

        if what_to_do:
            scripts, _, tmps, files = zip(*what_to_do)
            j = Job(submit_qtype)
            j.submit(scripts, name='cont_db', nproc=nproc, queue=submit_queue)
            j.wait()

            for output, final in files:
                if os.path.isfile(output):
                    simbad.db.convert_pdb_to_dat(output, final)
                else:
                    logger.critical("File missing: %s", output)

            for d in tmps:
                shutil.rmtree(d)

            # NOTE(review): the dimple scratch files are only removed when
            # MoRDa jobs were submitted - confirm whether they should also be
            # cleaned up on the other paths
            for f in dimple_files:
                if os.path.isdir(f):
                    shutil.rmtree(f)
                elif os.path.isfile(f):
                    os.remove(f)

    validate_compressed_database(database)