Пример #1
0
def build_models(process_only_these_targets=None,
                 process_only_these_templates=None,
                 model_seqid_cutoff=None,
                 write_modeller_restraints_file=False,
                 loglevel=None):
    """Uses the build_model method to build homology models for a given set of
    targets and templates.

    MPI-enabled: the selected templates are striped across MPI ranks.

    Parameters
    ----------
    process_only_these_targets : iterable of str, optional
        If given, only targets whose ids appear here are modeled.
    process_only_these_templates : iterable of str, optional
        If given, restricts which templates are used.
        NOTE(review): when ``model_seqid_cutoff`` is set, this variable is
        reassigned per target by the cutoff-based selection, and the
        reassigned value is what gets passed to
        ``write_build_models_metadata`` — confirm this is intentional.
    model_seqid_cutoff : float, optional
        If given, templates are selected per target via
        ``ensembler.core.select_templates_by_seqid_cutoff``, overriding
        ``process_only_these_templates``.
    write_modeller_restraints_file : bool, optional
        Forwarded to ``build_model``.
    loglevel : optional
        Forwarded to ``ensembler.utils.set_loglevel``.
    """
    # Note that this code uses an os.chdir call to switch into a temp directory before running Modeller.
    # This is because Modeller writes various output files in the current directory, and there is NO WAY
    # to define where these files are written, other than to chdir beforehand. If running this routine
    # in parallel, it is likely that occasional exceptions will occur, due to concurrent processes
    # making os.chdir calls.
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()

    # Indices (into templates_resolved_seq) of the templates to model against.
    if process_only_these_templates:
        selected_template_indices = [
            i for i, seq in enumerate(templates_resolved_seq)
            if seq.id in process_only_these_templates
        ]
    else:
        selected_template_indices = range(len(templates_resolved_seq))

    for target in targets:
        if process_only_these_targets and target.id not in process_only_these_targets:
            continue
        target_setup_data = build_models_target_setup(target)

        if model_seqid_cutoff:
            # Cutoff-based selection replaces any explicit template list.
            process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(
                target.id, seqid_cutoff=model_seqid_cutoff)
            selected_template_indices = [
                i for i, seq in enumerate(templates_resolved_seq)
                if seq.id in process_only_these_templates
            ]

        ntemplates_selected = len(selected_template_indices)

        # Stripe the selected templates across MPI ranks: each rank handles
        # every mpistate.size-th template starting at its own rank offset.
        for template_index in range(mpistate.rank, ntemplates_selected,
                                    mpistate.size):
            template_resolved_seq = templates_resolved_seq[
                selected_template_indices[template_index]]
            if process_only_these_templates and template_resolved_seq.id not in process_only_these_templates:
                continue
            build_model(
                target,
                template_resolved_seq,
                target_setup_data,
                write_modeller_restraints_file=write_modeller_restraints_file,
                loglevel=loglevel)
        write_build_models_metadata(target, target_setup_data,
                                    process_only_these_targets,
                                    process_only_these_templates,
                                    model_seqid_cutoff,
                                    write_modeller_restraints_file)
Пример #2
0
def build_models(process_only_these_targets=None, process_only_these_templates=None,
                 model_seqid_cutoff=None, write_modeller_restraints_file=False, loglevel=None):
    """Build homology models for every selected target/template pair.

    Thin driver around ``build_model``. MPI-enabled: templates are striped
    across ranks, so each rank models a disjoint subset.
    """
    # Modeller writes its scratch output into the current working directory,
    # so build_model switches into a temp dir via os.chdir first. Under MPI,
    # concurrent os.chdir calls from multiple processes can occasionally
    # raise exceptions — this is a known limitation.
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()

    # Candidate template indices: either the explicit selection, or all.
    if process_only_these_templates:
        template_indices = [
            index for index, seq in enumerate(templates_resolved_seq)
            if seq.id in process_only_these_templates
        ]
    else:
        template_indices = range(len(templates_resolved_seq))

    for target in targets:
        if process_only_these_targets and target.id not in process_only_these_targets:
            continue
        setup_data = build_models_target_setup(target)

        if model_seqid_cutoff:
            # A seqid cutoff overrides any explicit template selection;
            # the reassigned selection is also recorded in the metadata.
            process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(
                target.id, seqid_cutoff=model_seqid_cutoff)
            template_indices = [
                index for index, seq in enumerate(templates_resolved_seq)
                if seq.id in process_only_these_templates
            ]

        n_selected = len(template_indices)

        # Stripe the work across MPI ranks.
        for position in range(mpistate.rank, n_selected, mpistate.size):
            template_seq = templates_resolved_seq[template_indices[position]]
            if process_only_these_templates and template_seq.id not in process_only_these_templates:
                continue
            build_model(
                target,
                template_seq,
                setup_data,
                write_modeller_restraints_file=write_modeller_restraints_file,
                loglevel=loglevel)
        write_build_models_metadata(target, setup_data,
                                    process_only_these_targets,
                                    process_only_these_templates,
                                    model_seqid_cutoff,
                                    write_modeller_restraints_file)
Пример #3
0
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1,
                    archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a Folding@Home project.

    MPI-enabled: rank 0 performs template selection and system setup, then
    broadcasts the results; RUN directories are built in parallel, striped
    across ranks.

    Parameters
    ----------
    process_only_these_targets : iterable of str, optional
        If given, only targets whose ids appear here are packaged.
    process_only_these_templates : iterable of str, optional
        Forwarded to ``get_valid_templates_for_target``.
    model_seqid_cutoff, model_validation_score_cutoff, model_validation_score_percentile : optional
        Template-filtering criteria forwarded to ``get_valid_templates_for_target``.
    nclones : int
        Number of CLONEs per RUN, forwarded to ``generate_fah_run``.
    archive : Bool
        A .tgz compressed archive will be created for each individual RUN directory.
    openmm_platform : str
        OpenMM platform name forwarded to ``generate_fah_run``.
    temperature, collision_rate, timestep
        Simulation parameters (with units) for the integrator setup.
    loglevel : optional
        Forwarded to ``set_loglevel``.
    """
    set_loglevel(loglevel)

    # Only rank 0 creates the top-level FAH projects directory; all other
    # ranks wait at the barrier until it exists.
    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id
                                           not in process_only_these_targets):
            continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)

        models_target_dir = os.path.join(default_project_dirnames.models,
                                         target.id)
        if not os.path.exists(models_target_dir):
            continue

        # Keep all ranks in lockstep before the per-target broadcasts below.
        mpistate.comm.Barrier()

        # Placeholders on non-root ranks; overwritten by the bcast calls.
        sorted_valid_templates = []
        system = None
        renumbered_resnums = {}

        if mpistate.rank == 0:
            logger.info(
                '-------------------------------------------------------------------------'
            )
            logger.info('Building FAH OpenMM project for target {}'.format(
                target.id))
            logger.info(
                '-------------------------------------------------------------------------'
            )

            valid_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=
                model_validation_score_percentile)

            sorted_valid_templates = sort_valid_templates_by_seqid(
                target, valid_templates)

            create_target_project_dir(target)

            # System/integrator files are generated from the top-ranked
            # template. NOTE(review): assumes at least one valid template —
            # sorted_valid_templates[0] raises IndexError otherwise.
            system = setup_system_and_integrator_files(
                target, sorted_valid_templates[0], temperature, collision_rate,
                timestep)

            renumbered_resnums = get_renumbered_topol_resnums(target)

        # Share rank-0 results with every rank.
        sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates,
                                                     root=0)
        system = mpistate.comm.bcast(system, root=0)
        renumbered_resnums = mpistate.comm.bcast(renumbered_resnums, root=0)

        logger.debug("Building RUNs in parallel...")

        # RUN directories are striped across MPI ranks.
        for run_index in range(mpistate.rank, len(sorted_valid_templates),
                               mpistate.size):
            template = sorted_valid_templates[run_index]

            logger.info(
                '-------------------------------------------------------------------------'
            )
            logger.info('Building RUN{} for template {}'.format(
                run_index, template))
            logger.info(
                '-------------------------------------------------------------------------'
            )

            source_dir = os.path.join(models_target_dir, template)
            generate_fah_run(
                target_project_dir,
                template,
                source_dir,
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                renumbered_resnums,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
Пример #4
0
def cluster_models(process_only_these_targets=None, cutoff=0.06, loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Parameters
    ----------
    process_only_these_targets : iterable of str, optional
        If given, only targets whose ids appear here are processed.
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    loglevel : optional
        Forwarded to ``ensembler.utils.set_loglevel``.

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets): continue

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir): continue

        # =============================
        # Construct a mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb.gz') for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb') for template in templates
        }
        # Only templates whose compressed model file actually exists are valid.
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary.
        # BUGFIX: gzip.open() defaults to binary mode and yields bytes, so the
        # output file must be opened in binary mode ('wb') as well — writing
        # bytes to a text-mode handle raises TypeError on Python 3.
        for templateid in valid_templateids:
            if not os.path.exists(model_pdbfilenames_uncompressed[templateid]) or os.path.getsize(model_pdbfilenames_uncompressed[templateid]) == 0:
                with gzip.open(model_pdbfilenames_compressed[templateid]) as model_pdbfile_compressed:
                    with open(model_pdbfilenames_uncompressed[templateid], 'wb') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid] for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        for f in glob.glob(models_target_dir+'/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on C-alpha atoms only.
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff
        )
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'), 'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u+'\n')
            logger.info(
                '%d unique models (from original set of %d) using cutoff of %.3f nm' %
                        (len(unique_templateids), len(valid_templateids), cutoff)
            )

        # Clean up the uncompressed model files written above.
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id
        )
        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }

        project_metadata.add_data(metadata)
        project_metadata.write()
Пример #5
0
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1, archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a Folding@Home project.

    MPI-enabled.

    Parameters
    ----------
    archive : Bool
        A .tgz compressed archive will be created for each individual RUN directory.
    """
    set_loglevel(loglevel)

    # The top-level FAH projects directory is created exactly once, by rank 0;
    # the barrier keeps the other ranks from racing ahead of it.
    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        project_dir = os.path.join(fah_projects_dir, target.id)

        models_dir = os.path.join(default_project_dirnames.models, target.id)
        if not os.path.exists(models_dir):
            continue

        mpistate.comm.Barrier()

        # Placeholders on non-root ranks; filled in by the broadcasts below.
        ranked_templates = []
        openmm_system = None
        resnum_map = {}

        if mpistate.rank == 0:
            logger.info('-------------------------------------------------------------------------')
            logger.info('Building FAH OpenMM project for target {}'.format(target.id))
            logger.info('-------------------------------------------------------------------------')

            candidate_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=model_validation_score_percentile)

            ranked_templates = sort_valid_templates_by_seqid(target, candidate_templates)

            create_target_project_dir(target)

            # System/integrator files are generated from the top-ranked template.
            openmm_system = setup_system_and_integrator_files(
                target, ranked_templates[0], temperature, collision_rate, timestep)

            resnum_map = get_renumbered_topol_resnums(target)

        # Share rank-0 state with every rank before building RUNs.
        ranked_templates = mpistate.comm.bcast(ranked_templates, root=0)
        openmm_system = mpistate.comm.bcast(openmm_system, root=0)
        resnum_map = mpistate.comm.bcast(resnum_map, root=0)

        logger.debug("Building RUNs in parallel...")

        # RUN directories are striped across MPI ranks.
        for run in range(mpistate.rank, len(ranked_templates), mpistate.size):
            templateid = ranked_templates[run]

            logger.info('-------------------------------------------------------------------------')
            logger.info('Building RUN{} for template {}'.format(run, templateid))
            logger.info('-------------------------------------------------------------------------')

            model_dir = os.path.join(models_dir, templateid)
            generate_fah_run(
                project_dir,
                templateid,
                model_dir,
                openmm_system,
                run,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                resnum_map,
            )

            if archive:
                tgz_fah_run(target, run)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
Пример #6
0
def cluster_models(process_only_these_targets=None,
                   cutoff=0.06,
                   loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Parameters
    ----------
    process_only_these_targets : iterable of str, optional
        If given, only targets whose ids appear here are processed.
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    loglevel : optional
        Forwarded to ``ensembler.utils.set_loglevel``.

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id
                                           not in process_only_these_targets):
            continue

        models_target_dir = os.path.join(
            ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir): continue

        # =============================
        # Construct a mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id,
                                      'model.pdb.gz')
            for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id,
                                      'model.pdb')
            for template in templates
        }
        # Only templates whose compressed model file actually exists are valid.
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary.
        # BUGFIX: gzip.open() defaults to binary mode and yields bytes, so the
        # output file must be opened in binary mode ('wb') as well — writing
        # bytes to a text-mode handle raises TypeError on Python 3.
        for templateid in valid_templateids:
            if not os.path.exists(
                    model_pdbfilenames_uncompressed[templateid]
            ) or os.path.getsize(
                    model_pdbfilenames_uncompressed[templateid]) == 0:
                with gzip.open(model_pdbfilenames_compressed[templateid]
                               ) as model_pdbfile_compressed:
                    with open(model_pdbfilenames_uncompressed[templateid],
                              'wb') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid]
            for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        for f in glob.glob(models_target_dir + '/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on C-alpha atoms only.
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff)
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'),
                  'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u + '\n')
            logger.info(
                '%d unique models (from original set of %d) using cutoff of %.3f nm'
                % (len(unique_templateids), len(valid_templateids), cutoff))

        # Clean up the uncompressed model files written above.
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id)
        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }

        project_metadata.add_data(metadata)
        project_metadata.write()