Пример #1
0
def test_single_file(tmpdir, pdb, options):
    """Regression test: run ``propka.run.single`` on a PDB file on disk and
    compare the output written to a temporary directory with the reference."""
    reference, pdb_file = get_paths(pdb)
    input_name = str(pdb_file)

    # Work inside the temporary directory so the output files land there.
    with tmpdir.as_cwd():
        pkrun.single(input_name, options)
        output_dir = Path.cwd()
        compare_output(pdb, output_dir, reference)
Пример #2
0
def test_single_nopka(tmpdir):
    """Check that no pKa file is written when ``write_pka`` is ``False``.

    The run happens inside ``tmpdir`` so that the check cannot be affected by
    a stale ``.pka`` file in the caller's working directory, and so that the
    test does not leave files behind there.
    """
    pdb = "1FTJ-Chain-A"
    _, pdb_path = get_paths(pdb)
    filename = f"{pdb}.pdb"

    # Read the input before changing directory, as the other stream-based
    # tests do.
    with open(pdb_path, 'r') as handle:
        filestream = StringIO(handle.read())

    with tmpdir.as_cwd():
        pkrun.single(filename, stream=filestream, write_pka=False)
        assert not os.path.isfile(f"{pdb}.pka")
Пример #3
0
def test_single_extra_files_logwarn(tmpdir, caplog):
    """Check that passing file names through the option arguments produces a
    logged warning that names the ignored files."""
    _, pdb_file = get_paths("1FTJ-Chain-A")
    input_name = str(pdb_file)
    extra_file_args = ('-f foo.pdb bar.pdb', '-f test.pdb test2.pdb')
    expected = ("Ignoring extra filenames passed: [' foo.pdb bar.pdb', "
                "' test.pdb test2.pdb']")

    with tmpdir.as_cwd():
        pkrun.single(input_name, extra_file_args)

        # The warning is expected to be the first captured log record.
        first_record = caplog.records[0]
        assert expected in first_record.message
Пример #4
0
def test_single_filestream(tmpdir, pdb, options):
    """Regression test: feed the PDB contents to ``propka.run.single`` through
    an in-memory ``StringIO`` stream instead of a file on disk."""
    reference, pdb_file = get_paths(pdb)

    # Load the whole PDB into memory; only the stream is handed to propka.
    with open(pdb_file, 'r') as handle:
        contents = handle.read()
    stream = StringIO(contents)

    with tmpdir.as_cwd():
        pkrun.single(f"{pdb}.pdb", options, stream=stream)
        compare_output(pdb, Path.cwd(), reference)

    stream.close()
Пример #5
0
def test_single_propka_input(tmpdir):
    """Check that a ``.propka_input`` file is produced when the
    ``--generate-propka-input`` option is given."""
    name = "1FTJ-Chain-A"
    _, pdb_file = get_paths(name)
    flags = ('--generate-propka-input', )

    # The structure is passed as an in-memory stream.
    with open(pdb_file, 'r') as handle:
        contents = handle.read()
    stream = StringIO(contents)

    with tmpdir.as_cwd():
        pkrun.single(f"{name}.pdb", flags, stream=stream)
        expected_output = f"{name}.propka_input"
        assert os.path.isfile(expected_output)
Пример #6
0
    def _single_frame(self):
        """Run PROPKA on the current frame and record its pKa estimates.

        The atom group ``self.ag`` is written into an in-memory named stream
        which is then handed to ``pk.single``. On success the pKa values of
        the titratable groups are appended to ``self._pkas``; the column
        labels are taken from the first successful frame. On failure the
        frame is either recorded as failed (``self.skip_failure`` true) or a
        ``RuntimeError`` is raised.
        """
        stream = mda.lib.util.NamedStream(StringIO(), self.tmpfile)
        self.ag.write(stream)
        # rewind so that propka reads from the start
        stream.reset()

        try:
            # TODO: it would be nice to allow for other options, maybe for 3.2?
            molecule = pk.single(stream, optargs=['--quiet'])
        except (IndexError, AttributeError) as err:
            errmsg = "failure on frame: {0}".format(self._ts.frame)
            if self.skip_failure:
                # remember the failing frame and carry on
                warnings.warn(errmsg)
                self.num_failed_frames += 1
                self.failed_frames_log.append(self._ts.frame)
                self.failed_times.append(self._ts.time)
            else:
                raise_from(RuntimeError(errmsg), err)
        else:
            first_name = molecule.conformation_names[0]
            titratable = molecule.conformations[
                first_name].get_titratable_groups()

            # one row of pKa estimates per frame
            frame_pkas = [group.pka_value for group in titratable]
            self._pkas.append(frame_pkas)
            if self._columns is None:
                self._columns = [group.atom.resNumb for group in titratable]
        finally:
            # release the in-memory stream whatever happened above
            stream.close(force=True)
def get_pka_dict(pdb_fp):
    """Run PROPKA on ``pdb_fp`` and collect the pKa values into nested dicts.

    The text returned by ``protein.write_pka()`` is parsed line by line: the
    first three lines are treated as headers, every remaining line is split
    on whitespace, and rows whose first column is ``N+`` or ``C-`` (the
    termini) are dropped.

    Parameters
    ----------
    pdb_fp
        Input handed unchanged to ``run.single`` (presumably a path to a PDB
        file -- confirm against the caller).

    Returns
    -------
    dict
        ``{group: {residue_position: pka}}`` where ``group`` is the third
        column of a row, ``residue_position`` the second column as ``int``
        and ``pka`` the fourth column as ``float``.
        NOTE(review): depending on the PROPKA output layout the third column
        may be a chain identifier rather than a residue type -- confirm.

    Raises
    ------
    AssertionError
        If the same ``(group, residue_position)`` pair occurs twice.
    """
    protein = run.single(pdb_fp, write_pka=False)
    pka_string = protein.write_pka()

    # One entry per line; the first three lines are headers.
    pka_lines = pka_string.strip().split("\n")[3:]

    pka_dict = {}
    for line in pka_lines:
        # str.split() with no argument collapses runs of spaces/tabs.
        entry = line.split()
        if not entry:
            # tolerate blank lines instead of failing on a missing column
            continue
        if entry[0] in ("N+", "C-"):
            # terminal groups are not wanted
            continue

        residue_position = int(entry[1])
        group = entry[2]
        pka = float(entry[3])

        positions = pka_dict.setdefault(group, {})
        if residue_position in positions:
            # Raised explicitly (not via ``assert``) so that the check is
            # not stripped when Python runs with -O.
            raise AssertionError(
                "duplicate pKa entry for group {0!r} at position {1}".format(
                    group, residue_position))
        positions[residue_position] = pka

    # Kept from the original implementation: echo the result to stdout.
    print(pka_dict)
    return pka_dict
    
Пример #8
0
def get_propka(universe,
               sel='protein',
               start=None,
               stop=None,
               step=None,
               skip_failure=False):
    """Get pKa estimates for titratable residues along a trajectory.

    For every selected frame the atoms chosen by ``sel`` are written as a
    PDB into an in-memory stream, which is handed to PROPKA.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.
    skip_failure : bool
        If set to ``True``, skip frames where PROPKA fails. If ``False``
        raise an exception. The default is ``False``.
        Log file (at level warning) contains information on failed frames.

    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times as
        row labels. Empty if no frame was analysed successfully.

    Raises
    ------
    TypeError
        If ``sel`` is neither a string, a list nor a numpy array.
    IndexError, AttributeError
        Re-raised from PROPKA when a frame fails and ``skip_failure`` is
        ``False``.

    """

    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        # np.ndarray is the array *type*; np.array is a factory function and
        # makes isinstance() itself raise.
        atomsel = universe.atoms[sel]
    else:
        raise TypeError(
            "sel must be a selection string, a list or a numpy array of "
            "atom indices, not {0}".format(type(sel).__name__))

    # "filename" for our stream
    # use same name so that propka overwrites
    try:
        dirname = os.path.dirname(universe.filename)
    except TypeError:
        # universe has no file name (e.g. built in memory)
        dirname = os.path.curdir
    newname = os.path.join(dirname, 'current.pdb')

    # progress logging output (because this is slow...)
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps  "
        "[{percentage:5.1f}%]",
        interval=1)

    times = []
    pkas = []
    groups = None  # titratable groups of the last successful frame
    failed_frames = 0
    failed_frames_log = []
    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(StringIO(), newname)
        atomsel.write(pstream)

        pstream.reset()  # reset for reading

        # we feed the stream to propka, and it reads it as if it were a file on
        # disk
        try:
            mol = pk.single(pstream, optargs=['--quiet'])
        except (IndexError, AttributeError):
            # https://github.com/Becksteinlab/propkatraj/issues/13
            # https://github.com/Becksteinlab/propkatraj/issues/10
            if not skip_failure:
                raise
            err_msg = "{0} (failure {2}): failing frame {1}".format(
                universe.trajectory.filename, ts.frame, failed_frames)
            failed_frames += 1
            failed_frames_log.append(ts.frame)
            logging.warning(err_msg)
            continue
        finally:
            pstream.close(force=True)  # deallocate

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])

        # record time
        times.append(ts.time)

    if failed_frames_log:
        # every attempted frame ended up either in ``times`` or as a failure
        attempted = len(times) + failed_frames
        logging.warning('number of failed frames = {0}'.format(failed_frames))
        logging.warning('percent failure = {0:.3f}%'.format(
            float(failed_frames) / attempted * 100))
        logging.warning('failed frames: %r', failed_frames_log)

    # ``groups`` stays None when no frame was analysed successfully; pandas
    # then builds an empty frame instead of this function hitting an
    # unbound name.
    columns = None if groups is None else [g.atom.resNumb for g in groups]

    # a `pandas.DataFrame` is a good data structure for this data
    # (pd.Index with a float dtype replaces pd.Float64Index, which no longer
    # exists in pandas 2.x)
    df = pd.DataFrame(pkas,
                      index=pd.Index(times, dtype=float, name='time'),
                      columns=columns)

    return df
Пример #9
0
    def evaluate_pkas(self, protein):
        """Run PROPKA on ``protein`` and update the tracked protonation states.

        Steps, in order:

        1. Relabel the protein to the "Standard" format and write it to
           ``_propka_inp.pdb``.
        2. Run PROPKA on that file; if ``self._buried_cutoff == "sas"`` also
           run the MSMS tools configured under ``config["PATHS"]["MSMS_DIR"]``.
        3. Copy the intermediate files that exist into ``save/`` and increment
           ``self._step``.
        4. Let the ``montecarlo`` module propose protonation changes for the
           titratable residues that are not listed in
           ``self._default_protonation_states``.
        5. Merge those proposals into ``self._updated_protonation``.

        Side effects: files are read and written in the current working
        directory, ``self._step`` is incremented, ``self._updated_protonation``
        is replaced and a shallow copy of it is appended to
        ``self._history``.

        Parameters
        ----------
        protein
            Object providing ``relabel``, ``write_pdb`` and ``get_residue``.

        Returns
        -------
        list
            ``self._updated_protonation``; each item is a list
            ``[chain, res_num, "protonate" | "deprotonate"]`` optionally
            followed by an integer selector.

        Raises
        ------
        exceptions.Propka_Error
            If PROPKA is unavailable or fails, or if a deprotonated HIS entry
            ends up without a matching "protonate" entry.
        exceptions.MSMS_Error
            If preparing or launching the MSMS commands raises.
        """
        # First we transform protein to Standard
        protein.relabel(format="Standard")
        protein.write_pdb("_propka_inp.pdb")

        # Then we call propka from the import
        if not hasattr(run, "single"):
            logger.error("Propka not properly installed, or wrong version")
            raise exceptions.Propka_Error

        try:
            run.single("_propka_inp.pdb")
        except (Exception, SystemExit) as err:
            # SystemExit is still converted (PROPKA may signal failure by
            # exiting -- TODO confirm), but KeyboardInterrupt now propagates
            # instead of being reported as a PROPKA failure.
            logger.error("Error running propka")
            raise exceptions.Propka_Error from err

        if os.path.isfile("_propka_inp.propka_input"):
            logger.debug("Removing file: _propka_inp.propka_input")
            os.remove("_propka_inp.propka_input")

        logger.info("[propka]           ==>> SUCCESS")

        # And run MSMS to generate the SAS if called for
        if self._buried_cutoff == "sas":
            try:
                pdb_to_xyzrn_cmd = config["PATHS"][
                    "MSMS_DIR"] + 'pdb_to_xyzrn _propka_inp.pdb > _msms_inp.xyzrn'
                msms_cmd = config["PATHS"][
                    "MSMS_DIR"] + 'msms.x86_64Linux2.2.6.1 -if _msms_inp.xyzrn -af _msms_out'
                xyzrn_status = os.system(pdb_to_xyzrn_cmd)
                msms_status = os.system(msms_cmd)
            except Exception as err:
                logger.error("Error running MSMS")
                raise exceptions.MSMS_Error from err

            # os.system() does not raise when the command itself fails, so
            # surface a non-zero exit status in the log.
            # NOTE(review): consider raising MSMS_Error here once the exit
            # codes of the MSMS tools have been confirmed.
            for cmd, status in ((pdb_to_xyzrn_cmd, xyzrn_status),
                                (msms_cmd, msms_status)):
                if status != 0:
                    logger.error("MSMS command exited with status %s: %s",
                                 status, cmd)

        # Now we move onto davids actual script for evaluation of the protons
        # and what not

        # SAVE THE DATA
        os.makedirs("save", exist_ok=True)

        if os.path.isfile("_propka_inp.pka"):
            shutil.copy("_propka_inp.pka", f"save/{self._step}.pka")

        if os.path.isfile("inConstr") and self._step > 0:
            shutil.copy("inConstr", f"save/{self._step-1}.inConstr")

        if os.path.isfile("_msms_out.area"):
            shutil.copy("_msms_out.area", f"save/{self._step}.area")

        self._step += 1

        with open("_propka_inp.pdb", 'r') as in_pdb:
            # Get all of the titratable residues as a list
            titratable_residues = montecarlo.process_pdb(in_pdb.readlines())

        # REMOVE THE RESIDUES HERE THAT ARE STATIC
        logger.debug("Removing static protonation state residues...")
        static_residues = [
            protein.get_residue(default_state)
            for default_state in self._default_protonation_states
        ]
        static_indices = []
        for i, titratable_residue in enumerate(titratable_residues):
            residue = protein.get_residue(
                [titratable_residue.chain,
                 int(titratable_residue.res_num)])
            # any() records each index once, even if the residue matches
            # several default states; a duplicated index would make the
            # removal below delete the wrong element.
            if any(residue == static for static in static_residues):
                static_indices.append(i)

        # delete from the back so the remaining indices stay valid
        for i in reversed(static_indices):
            del titratable_residues[i]

        # Define connections between residues
        montecarlo.define_connections(titratable_residues,
                                      PROTON_PARTNER_CUTOFF)

        # Chain identifiers are considered present when the first residue has
        # one; an empty residue list counts as "no chains" instead of raising
        # IndexError.
        chains = bool(titratable_residues and titratable_residues[0].chain)

        # propka output, titratable residues, and if we have multiple chains...
        calc_pKa_data = montecarlo.calc_pKa_total_pdb("_propka_inp.pka",
                                                      titratable_residues,
                                                      chains)
        for res in titratable_residues:
            res.assign_pKa(calc_pKa_data)

        solv_data = montecarlo.find_solv_shell("_propka_inp.pka", chains)

        if self._buried_cutoff == "sas":
            msms_data = montecarlo.store_sas_area("_msms_out.area", chains)

        # A new list holding the same residue objects, so that the network
        # code cannot reorder or shrink ``titratable_residues`` itself.
        titr_stack = list(titratable_residues)

        all_networks = montecarlo.define_aa_networks(titr_stack)
        if self._buried_cutoff == "sas":
            all_networks = montecarlo.find_network_solvent_access(
                all_networks, msms_data, self._buried_cutoff,
                self._partner_dist)
        else:
            all_networks = montecarlo.find_network_solvent_access(
                all_networks, solv_data, self._buried_cutoff,
                self._partner_dist)

        # Now we do monte carlo
        montecarlo.MC_prot_change(all_networks, self._pH)
        for residue in titratable_residues:
            residue.update_prots()

        protonation_changes = []
        remove = []
        switch_his = []

        for residue in titratable_residues:
            command = residue.change[0]
            if command == "None":
                continue

            if command == "Add":
                protonate = True
            elif command == "Remove":
                protonate = False
            else:
                logger.warning("Unknown residue change command")
                continue

            action = "protonate" if protonate else "deprotonate"
            change = [residue.chain, int(residue.res_num), action]

            if residue.ter_name == 'N+' and not protonate:
                # -1 selects deprotonation of the N-terminus
                change.append(-1)

            elif residue.ter_name == 'C-' and protonate:
                logger.warning("Tried to protonate the c-terminus")
                logger.warning(
                    "This has been turned off permanently due to DMD issues")
                continue

            else:
                # Protonation and deprotonation follow the same steps with a
                # different lookup table and a different HIS nitrogen.
                if protonate:
                    standard = constants.PROTONATED_STANDARD
                    his_switch_atom = "ND1"
                else:
                    standard = constants.DEPROTONATED_STANDARD
                    his_switch_atom = "NE2"

                amino_acid = residue.amino_acid.upper()
                heteroatom = residue.change_heteroatom[0]

                if amino_acid not in standard:
                    logger.debug("Cannot %s residue %s", action,
                                 residue.amino_acid)
                    remove.append(change[:2])
                    continue

                if amino_acid == "HIS" and heteroatom == his_switch_atom:
                    switch_his.append(change[:2])
                    continue

                # the 1-based position of the matching pair is the selector
                for index, pairs in enumerate(standard[amino_acid]):
                    if heteroatom == pairs[0]:
                        change.append(index + 1)
                        break

                if len(change) != 4:
                    logger.warning(
                        "Cannot change protonation state of atom %s in res "
                        "%s %s", heteroatom, residue.amino_acid,
                        residue.res_num)
                    continue

            protonation_changes.append(change)

        # Merge the proposals into the running state list: an existing entry
        # for the same residue is replaced, otherwise the change is appended.
        for change in protonation_changes:
            residue = protein.get_residue(change[:2])
            # A HIS change other than "protonate" is accompanied by a plain
            # "protonate" entry for the same residue.
            needs_his_partner = (residue.name.upper() == "HIS"
                                 and change[2] != "protonate")

            for i, current in enumerate(self._updated_protonation):
                if residue == protein.get_residue(current[:2]):
                    self._updated_protonation[i] = change
                    if needs_his_partner:
                        self._updated_protonation.append(
                            [change[0], change[1], "protonate"])
                    break
            else:
                if needs_his_partner:
                    self._updated_protonation.append(
                        [change[0], change[1], "protonate"])
                self._updated_protonation.append(change)

        # Drop every entry that belongs to a residue queued in ``switch_his``
        # or ``remove``.
        for switch in switch_his + remove:
            switch_res = protein.get_residue(switch[:2])
            self._updated_protonation[:] = [
                state for state in self._updated_protonation
                if not (protein.get_residue(state[:2]) == switch_res)
            ]

        # Remove duplicate entries. dict.fromkeys keeps the first occurrence
        # of each row, so the result has a reproducible order (a plain set
        # does not).
        self._updated_protonation = [
            list(item) for item in dict.fromkeys(
                tuple(row) for row in self._updated_protonation)
        ]

        # Sanity check: every deprotonated HIS must also have a "protonate"
        # entry for the same residue.
        for state in self._updated_protonation:
            residue = protein.get_residue(state[:2])
            if residue.name.upper() != "HIS" or state[2] != "deprotonate":
                continue

            has_partner = any(
                residue == protein.get_residue(other_state[:2])
                and other_state[2] == "protonate"
                for other_state in self._updated_protonation)
            if not has_partner:
                logger.error(
                    "Deprotonated HIS %s has no matching 'protonate' entry",
                    state[:2])
                raise exceptions.Propka_Error

        self._history.append(self._updated_protonation.copy())
        return self._updated_protonation
Пример #10
0
def get_propka(universe,
               sel='protein',
               start=None,
               stop=None,
               step=None,
               skip_failure=False):
    """Get and store pKas for titrateable residues along trajectory.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.
    skip_failure : bool
        If set to ``True``, skip frames where PROPKA fails. If ``False``
        raise an exception. The default is ``False``.
        Log file (at level warning) contains information on failed frames.


    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times as
        row labels. Empty if no frame was analysed successfully.


    Raises
    ------
    TypeError
        If ``sel`` is neither a string, a list nor a numpy array.
    RuntimeError
        If PROPKA fails on a frame and ``skip_failure`` is ``False``.


    Notes
    -----
    Currently, temporary :program:`propka` files are written in the same
    directory as the input trajectory file. This will leave a ``current.pka``
    and ``current.propka_input`` file post-analysis. These are the temporary
    files for the final frame and can be removed. Should the trajectory file
    not have an input directory (e.g. when using MDAnalysis' `fetch_mmtf`
    method), then the files will be written to the current directory.

    Known issues:

    1. Due to the current behaviour of the MDAnalysis PDBWriter, non-protein
       atoms are written to PDBs using `ATOM` records instead of `HETATM`.
       This is likely to lead to undefined behaviour in :program:`propka`,
       which will likely expect `HETATM` inputs. We recommend users to only
       pass protein atoms for now. See the following issue for more details:
       https://github.com/Becksteinlab/propkatraj/issues/24

    """

    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        atomsel = universe.atoms[sel]
    else:
        # fail with a clear message instead of an unbound ``atomsel`` below
        raise TypeError(
            "sel must be a selection string, a list or a numpy array of "
            "atom indices, not {0}".format(type(sel).__name__))

    # Issue #23 (keep until the PDBWriter is fixed)
    if len(atomsel.select_atoms('not protein')) > 0:
        wmsg = ("Non protein atoms passed to propka 3.1.\n MDAnalysis' "
                "PDBWriter does not currently write non-standard residues "
                "correctly as HETATM records and this may lead to "
                "incorrect pKa predictions.\n"
                "See https://github.com/Becksteinlab/propkatraj/issues/24 "
                " for more details")
        warnings.warn(wmsg)

    # "filename" for our stream
    # use same name so that propka overwrites
    try:
        newname = os.path.join(os.path.dirname(universe.filename),
                               'current.pdb')
    except TypeError:
        # we have a trajectory without a directory
        newname = os.path.join(os.path.curdir, 'current.pdb')

    # progress logging output (because this is slow...)
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps  "
        "[{percentage:5.1f}%]",
        interval=1)

    times = []
    pkas = []
    groups = None  # titratable groups of the last successful frame
    failed_frames = 0
    failed_frames_log = []
    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(StringIO(), newname)
        atomsel.write(pstream)

        pstream.reset()  # reset for reading

        # we feed the stream to propka, and it reads it as if it were a file on
        # disk
        try:
            mol = pk.single(pstream, optargs=['--quiet'])
        except (IndexError, AttributeError) as err:
            # https://github.com/Becksteinlab/propkatraj/issues/13
            # https://github.com/Becksteinlab/propkatraj/issues/10
            err_msg = "{0} (failure {2}): failing frame {1}".format(
                universe.trajectory.filename, ts.frame, failed_frames)
            if not skip_failure:
                raise_from(RuntimeError(err_msg), err)
            else:
                failed_frames += 1
                failed_frames_log.append(ts.frame)
                logging.warning(err_msg)
                continue
        finally:
            pstream.close(force=True)  # deallocate

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])

        # record time
        times.append(ts.time)

    if failed_frames_log:
        # every attempted frame ended up either in ``times`` or as a failure
        attempted = len(times) + failed_frames
        logging.warning('number of failed frames = {0}'.format(failed_frames))
        logging.warning('percent failure = {0:.3f}%'.format(
            float(failed_frames) / attempted * 100))
        logging.warning('failed frames: %r', failed_frames_log)

    # ``groups`` stays None when every frame failed or the frame slice was
    # empty; pandas then builds an empty frame instead of this function
    # hitting an unbound name.
    columns = None if groups is None else [g.atom.resNumb for g in groups]

    # a `pandas.DataFrame` is a good data structure for this data
    # (pd.Index with a float dtype replaces pd.Float64Index, which no longer
    # exists in pandas 2.x)
    df = pd.DataFrame(pkas,
                      index=pd.Index(times, dtype=float, name='time'),
                      columns=columns)

    return df
Пример #11
0
def get_propka(universe, sel='protein', start=None, stop=None, step=None):
    """Get pKa estimates for titratable residues along a trajectory.

    For every selected frame the atoms chosen by ``sel`` are written as a
    PDB into an in-memory stream, which is handed to PROPKA.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.

    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times as
        row labels. Empty if no frame was analysed.

    Raises
    ------
    TypeError
        If ``sel`` is neither a string, a list nor a numpy array.

    """

    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        # np.ndarray is the array *type*; np.array is a factory function and
        # makes isinstance() itself raise.
        atomsel = universe.atoms[sel]
    else:
        raise TypeError(
            "sel must be a selection string, a list or a numpy array of "
            "atom indices, not {0}".format(type(sel).__name__))

    # "filename" for our stream
    # use same name so that propka overwrites
    try:
        dirname = os.path.dirname(universe.filename)
    except TypeError:
        # universe has no file name (e.g. built in memory)
        dirname = os.path.curdir
    newname = os.path.join(dirname, 'current.pdb')

    # progress logging output (because this is slow...)
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps  "
        "[{percentage:5.1f}%]",
        interval=1)

    times = []
    pkas = []
    groups = None  # titratable groups of the last analysed frame
    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(cStringIO.StringIO(), newname)
        atomsel.write(pstream)

        pstream.reset()  # reset for reading

        # we feed the stream to propka, and it reads it as if it were a file on
        # disk
        try:
            mol = pk.single(pstream, optargs=['--quiet'])
        finally:
            # deallocate even when propka raises
            pstream.close(force=True)

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])

        # record time
        times.append(ts.time)

    # ``groups`` stays None when the frame slice was empty; pandas then
    # builds an empty frame instead of this function hitting an unbound name.
    columns = None if groups is None else [g.atom.resNumb for g in groups]

    # a `pandas.DataFrame` is a good data structure for this data
    # (pd.Index with a float dtype replaces pd.Float64Index, which no longer
    # exists in pandas 2.x)
    df = pd.DataFrame(pkas,
                      index=pd.Index(times, dtype=float, name='time'),
                      columns=columns)

    return df