def mol2_to_dataframes(filename): """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes. Parameters ---------- filename : str Name of mol2 filename Returns ------- atoms_frame : pd.DataFrame DataFrame containing atom information bonds_frame : pd.DataFrame DataFrame containing bond information Notes ----- These dataframes may contain force field information as well as the information necessary for constructing the coordinates and molecular topology. This function has been tested for GAFF and sybyl-style mol2 files but has been primarily tested on GAFF mol2 files. This function does NOT accept multi-structure MOL2 files!!! See Also -------- If you just need the coordinates and bonds, use load_mol2(filename) to get a Trajectory object. """ pd = import_('pandas') with open(filename) as f: data = dict((key, list(grp)) for key, grp in itertools.groupby(f, _parse_mol2_sections)) # Mol2 can have "status bits" at the end of the bond lines. We don't care # about these, but they interfere with using pd_read_table because it looks # like one line has too many columns. So we just regex out the offending # text. status_bit_regex = "BACKBONE|DICT|INTERRES|\|" data["@<TRIPOS>BOND\n"] = [re.sub(status_bit_regex, lambda _: "", s) for s in data["@<TRIPOS>BOND\n"]] if len(data["@<TRIPOS>BOND\n"]) > 1: csv = StringIO() csv.writelines(data["@<TRIPOS>BOND\n"][1:]) csv.seek(0) bonds_frame = pd.read_table(csv, names=["bond_id", "id0", "id1", "bond_type"], index_col=0, header=None, sep="\s*", engine='python') else: bonds_frame = None csv = StringIO() csv.writelines(data["@<TRIPOS>ATOM\n"][1:]) csv.seek(0) atoms_frame = pd.read_csv(csv, sep="\s*", engine='python', header=None) ncols = atoms_frame.shape[1] names=["serial", "name", "x", "y", "z", "atype", "code", "resName", "charge", "status"] atoms_frame.columns = names[:ncols] return atoms_frame, bonds_frame
def mol2_to_dataframes(filename): """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes. Parameters ---------- filename : str Name of mol2 filename Returns ------- atoms_frame : pd.DataFrame DataFrame containing atom information bonds_frame : pd.DataFrame DataFrame containing bond information Notes ----- These dataframes may contain force field information as well as the information necessary for constructing the coordinates and molecular topology. This function has been tested for GAFF and sybyl-style mol2 files but has been primarily tested on GAFF mol2 files. This function does NOT accept multi-structure MOL2 files!!! See Also -------- If you just need the coordinates and bonds, use load_mol2(filename) to get a Trajectory object. """ pd = import_('pandas') with open(filename) as f: data = dict((key, list(grp)) for key, grp in itertools.groupby(f, _parse_mol2_sections)) csv = StringIO() csv.writelines(data["@<TRIPOS>BOND\n"][1:]) csv.seek(0) bonds_frame = pd.read_table(csv, names=["bond_id", "id0", "id1", "bond_type"], index_col=0, header=None, sep="\s*", engine='python') csv = StringIO() csv.writelines(data["@<TRIPOS>ATOM\n"][1:]) csv.seek(0) atoms_frame = pd.read_csv(csv, sep="\s*", engine='python', header=None, names=[ "serial", "name", "x", "y", "z", "atype", "code", "resName", "charge" ]) return atoms_frame, bonds_frame
def mol2_to_dataframes(filename): """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes. Parameters ---------- filename : str Name of mol2 filename Returns ------- atoms_frame : pd.DataFrame DataFrame containing atom information bonds_frame : pd.DataFrame DataFrame containing bond information Notes ----- These dataframes may contain force field information as well as the information necessary for constructing the coordinates and molecular topology. This function has been tested for GAFF and sybyl-style mol2 files but has been primarily tested on GAFF mol2 files. This function does NOT accept multi-structure MOL2 files!!! See Also -------- If you just need the coordinates and bonds, use load_mol2(filename) to get a Trajectory object. """ pd = import_("pandas") with open(filename) as f: data = dict((key, list(grp)) for key, grp in itertools.groupby(f, _parse_mol2_sections)) csv = StringIO() csv.writelines(data["@<TRIPOS>BOND\n"][1:]) csv.seek(0) bonds_frame = pd.read_table(csv, names=["bond_id", "id0", "id1", "bond_type"], index_col=0, header=None, sep="\s*") csv = StringIO() csv.writelines(data["@<TRIPOS>ATOM\n"][1:]) csv.seek(0) atoms_frame = pd.read_csv( csv, sep="\s*", names=["serial", "name", "x", "y", "z", "atype", "code", "resName", "charge"], header=None ) # , usecols=range(1, 10)) # usecols not available in pandas 0.11 return atoms_frame, bonds_frame