Exemplo n.º 1
0
def mol2_to_dataframes(filename):
    """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes.

    Parameters
    ----------
    filename : str
        Name of mol2 filename

    Returns
    -------
    atoms_frame : pd.DataFrame
        DataFrame containing atom information
    bonds_frame : pd.DataFrame
        DataFrame containing bond information

    Notes
    -----
    These dataframes may contain force field information as well as the
    information necessary for constructing the coordinates and molecular
    topology.  This function has been tested for GAFF and sybyl-style
    mol2 files but has been primarily tested on GAFF mol2 files.
    This function does NOT accept multi-structure MOL2 files!!!

    See Also
    --------
    If you just need the coordinates and bonds, use load_mol2(filename)
    to get a Trajectory object.
    """
    pd = import_('pandas')
    with open(filename) as f:
        data = dict((key, list(grp)) for key, grp in itertools.groupby(f, _parse_mol2_sections))

    # Mol2 can have "status bits" at the end of the bond lines. We don't care
    # about these, but they interfere with using pd_read_table because it looks
    # like one line has too many columns. So we just regex out the offending
    # text.
    status_bit_regex = "BACKBONE|DICT|INTERRES|\|"
    data["@<TRIPOS>BOND\n"] = [re.sub(status_bit_regex, lambda _: "", s)
                               for s in data["@<TRIPOS>BOND\n"]]

    if len(data["@<TRIPOS>BOND\n"]) > 1:
        csv = StringIO()
        csv.writelines(data["@<TRIPOS>BOND\n"][1:])
        csv.seek(0)
        bonds_frame = pd.read_table(csv, names=["bond_id", "id0", "id1", "bond_type"],
            index_col=0, header=None, sep="\s*", engine='python')
    else:
        bonds_frame = None

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>ATOM\n"][1:])
    csv.seek(0)
    atoms_frame = pd.read_csv(csv, sep="\s*", engine='python',  header=None)
    ncols = atoms_frame.shape[1]
    names=["serial", "name", "x", "y", "z", "atype", "code", "resName", "charge", "status"]
    atoms_frame.columns = names[:ncols]
    
    return atoms_frame, bonds_frame
Exemplo n.º 2
0
def mol2_to_dataframes(filename):
    """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes.

    Parameters
    ----------
    filename : str
        Name of mol2 filename

    Returns
    -------
    atoms_frame : pd.DataFrame
        DataFrame containing atom information
    bonds_frame : pd.DataFrame
        DataFrame containing bond information

    Notes
    -----
    These dataframes may contain force field information as well as the
    information necessary for constructing the coordinates and molecular
    topology.  This function has been tested for GAFF and sybyl-style
    mol2 files but has been primarily tested on GAFF mol2 files.
    This function does NOT accept multi-structure MOL2 files!!!

    See Also
    --------
    If you just need the coordinates and bonds, use load_mol2(filename)
    to get a Trajectory object.
    """
    pd = import_('pandas')
    with open(filename) as f:
        data = dict((key, list(grp)) for key, grp in itertools.groupby(f, _parse_mol2_sections))

    # Mol2 can have "status bits" at the end of the bond lines. We don't care
    # about these, but they interfere with using pd_read_table because it looks
    # like one line has too many columns. So we just regex out the offending
    # text.
    status_bit_regex = "BACKBONE|DICT|INTERRES|\|"
    data["@<TRIPOS>BOND\n"] = [re.sub(status_bit_regex, lambda _: "", s)
                               for s in data["@<TRIPOS>BOND\n"]]

    if len(data["@<TRIPOS>BOND\n"]) > 1:
        csv = StringIO()
        csv.writelines(data["@<TRIPOS>BOND\n"][1:])
        csv.seek(0)
        bonds_frame = pd.read_table(csv, names=["bond_id", "id0", "id1", "bond_type"],
            index_col=0, header=None, sep="\s*", engine='python')
    else:
        bonds_frame = None

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>ATOM\n"][1:])
    csv.seek(0)
    atoms_frame = pd.read_csv(csv, sep="\s*", engine='python',  header=None)
    ncols = atoms_frame.shape[1]
    names=["serial", "name", "x", "y", "z", "atype", "code", "resName", "charge", "status"]
    atoms_frame.columns = names[:ncols]
    
    return atoms_frame, bonds_frame
Exemplo n.º 3
0
def mol2_to_dataframes(filename):
    """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes.

    Parameters
    ----------
    filename : str
        Name of mol2 filename

    Returns
    -------
    atoms_frame : pd.DataFrame
        DataFrame containing atom information
    bonds_frame : pd.DataFrame
        DataFrame containing bond information

    Notes
    -----
    These dataframes may contain force field information as well as the
    information necessary for constructing the coordinates and molecular
    topology.  This function has been tested for GAFF and sybyl-style
    mol2 files but has been primarily tested on GAFF mol2 files.
    This function does NOT accept multi-structure MOL2 files!!!

    See Also
    --------
    If you just need the coordinates and bonds, use load_mol2(filename)
    to get a Trajectory object.
    """
    pd = import_('pandas')
    with open(filename) as f:
        data = dict((key, list(grp))
                    for key, grp in itertools.groupby(f, _parse_mol2_sections))

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>BOND\n"][1:])
    csv.seek(0)
    bonds_frame = pd.read_table(csv,
                                names=["bond_id", "id0", "id1", "bond_type"],
                                index_col=0,
                                header=None,
                                sep="\s*",
                                engine='python')

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>ATOM\n"][1:])
    csv.seek(0)
    atoms_frame = pd.read_csv(csv,
                              sep="\s*",
                              engine='python',
                              header=None,
                              names=[
                                  "serial", "name", "x", "y", "z", "atype",
                                  "code", "resName", "charge"
                              ])
    return atoms_frame, bonds_frame
Exemplo n.º 4
0
def mol2_to_dataframes(filename):
    """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes.


    Parameters
    ----------
    filename : str
        Name of mol2 filename

    Returns
    -------
    atoms_frame : pd.DataFrame
        DataFrame containing atom information
    bonds_frame : pd.DataFrame
        DataFrame containing bond information
    
    Notes
    -----
    These dataframes may contain force field information as well as the
    information necessary for constructing the coordinates and molecular
    topology.  This function has been tested for GAFF and sybyl-style 
    mol2 files but has been primarily tested on GAFF mol2 files. 
    This function does NOT accept multi-structure MOL2 files!!!    
    
    See Also
    --------
    If you just need the coordinates and bonds, use load_mol2(filename)
    to get a Trajectory object.
    """
    pd = import_("pandas")
    with open(filename) as f:
        data = dict((key, list(grp)) for key, grp in itertools.groupby(f, _parse_mol2_sections))

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>BOND\n"][1:])
    csv.seek(0)
    bonds_frame = pd.read_table(csv, names=["bond_id", "id0", "id1", "bond_type"], index_col=0, header=None, sep="\s*")

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>ATOM\n"][1:])
    csv.seek(0)
    atoms_frame = pd.read_csv(
        csv, sep="\s*", names=["serial", "name", "x", "y", "z", "atype", "code", "resName", "charge"], header=None
    )  # , usecols=range(1, 10))  # usecols not available in pandas 0.11
    return atoms_frame, bonds_frame