Пример #1
0
 def __getitem__(self, index: int):
     """Load the example stored at position ``index`` of the file list.

     The returned dict holds the output of ``fo.bp_to_df`` under 'atoms',
     the structure's ``id`` under 'id' and the source path (as a string)
     under 'file_path'. When ``self._read_bonds`` is truthy the file is
     read a second time as a molecule and the bond list of its first
     entry is stored under 'bonds'. If ``self._transform`` is set, the
     dict is passed through it before being returned.

     Raises:
         IndexError: if ``index`` is not in ``[0, self._num_examples)``.
     """
     if index < 0 or index >= self._num_examples:
         raise IndexError(index)

     sdf_path = str(self._file_list[index])
     # Both readers are invoked with the same parsing options.
     reader_opts = {'sanitize': False, 'add_hs': False, 'remove_hs': False}

     # Structure-level read (atoms only, no bonds).
     structure = fo.read_sdf(sdf_path, **reader_opts)
     item = {
         'atoms': fo.bp_to_df(structure),
         'id': structure.id,
         'file_path': sdf_path,
     }

     # Bonds are optional because they need a second pass over the file.
     if self._read_bonds:
         molecules = fo.read_sdf_to_mol(sdf_path, **reader_opts)
         item['bonds'] = fo.get_bonds_list_from_mol(molecules[0])

     if self._transform:
         item = self._transform(item)
     return item
Пример #2
0
def parse_ensemble(name, ensemble):
    """Build one atoms dataframe for an ensemble.

    Args:
        name: Ensemble name. When ``ensemble`` is None it is passed to
            ``dt.read_any`` as the thing to read instead.
        ensemble: None, or a mapping from subunit name to either a
            ``pd.DataFrame`` or an input accepted by ``dt.read_any``.

    Returns:
        When ``ensemble`` is None, the output of ``dt.bp_to_df`` for
        ``name`` (no extra columns are added). Otherwise the concatenation
        of all subunit frames, each tagged with a 'subunit' column, plus an
        'ensemble' column set to ``name``.

    Raises:
        ValueError: from ``pd.concat`` if ``ensemble`` is an empty mapping.
    """
    if ensemble is None:
        return dt.bp_to_df(dt.read_any(name))

    frames = []
    for subunit, f in ensemble.items():
        if isinstance(f, pd.DataFrame):
            # Work on a copy so tagging the subunit does not write a new
            # column into the caller's dataframe.
            curr = f.copy()
        else:
            curr = dt.bp_to_df(dt.read_any(f))

        curr['subunit'] = subunit
        frames.append(curr)

    df = pd.concat(frames)
    df['ensemble'] = name
    return df
Пример #3
0
 def __call__(self, x):
     """Attach the original structure and the label to item ``x``.

     The extension is stripped from ``x['id']`` and the result is used as
     the lookup key into ``self.mut_orig_mapping`` and ``self.labels``.
     The existing 'atoms' entry is moved to 'mutated_atoms', the original
     structure is read from ``self.orig_file_dir`` into 'original_atoms',
     and the label is stored as a string under 'label'. ``x`` is modified
     in place and also returned.
     """
     stem, _ext = os.path.splitext(x['id'])
     x['id'] = stem

     # Locate and load the original structure that belongs to this item.
     original_name = self.mut_orig_mapping[stem]['original']
     original_path = os.path.join(self.orig_file_dir, original_name)
     x['original_atoms'] = fo.bp_to_df(fo.read_any(original_path))

     # The atoms that were loaded with the item become the mutated ones.
     x['mutated_atoms'] = x.pop('atoms')
     x['label'] = str(self.labels.loc[stem])
     return x
Пример #4
0
 def _pose_to_df(self, pose):
     """
     Turn a pyrosetta pose into the dataframe built by ``fo.bp_to_df``.

     The pose is dumped as PDB text into an in-memory stream, that text is
     parsed with Biopython's ``PDBParser`` (warnings silenced), and the
     resulting structure is named after ``pose.pdb_info().name()``.
     """
     structure_name = pose.pdb_info().name()

     # Dump the pose into a rosetta string stream rather than a file.
     pdb_stream = self.pyrosetta.rosetta.std.ostringstream()
     pose.dump_pdb(pdb_stream)
     pdb_text = io.StringIO(pdb_stream.str())

     structure = Bio.PDB.PDBParser(QUIET=True).get_structure(
         structure_name, pdb_text)
     return fo.bp_to_df(structure)
Пример #5
0
    def __getitem__(self, index: int):
        """Load the example stored at position ``index`` of the file list.

        The returned dict holds the output of ``fo.bp_to_df`` under
        'atoms', the file's ``name`` under 'id' and the path as a string
        under 'file_path'. If ``self._transform`` is set, the dict is
        passed through it before being returned.

        Raises:
            IndexError: if ``index`` is not in ``[0, self._num_examples)``.
        """
        if index < 0 or index >= self._num_examples:
            raise IndexError(index)

        path = self._file_list[index]
        atoms = fo.bp_to_df(fo.read_any(path))
        item = {'atoms': atoms, 'id': path.name, 'file_path': str(path)}

        if not self._transform:
            return item
        return self._transform(item)
Пример #6
0
def convert_to_hdf5(input_dir, label_file, hdf_file):
    """Collect structures, ligands and labels into a single HDF5 file.

    Writes five keys to ``hdf_file``: 'proteins', 'pockets', 'pdb_codes',
    'ligands' and 'labels'. As a side effect, a combined
    ``all_ligands.sdf`` is written into ``input_dir``.

    Args:
        input_dir: Directory searched for '.cif' and '.sdf' files.
        label_file: CSV file with a 'pdb' column, used as the label index.
        hdf_file: Destination passed to ``DataFrame.to_hdf``.

    Raises:
        ValueError: from ``pd.concat`` if no protein or no pocket file is
            found, or from pandas if the number of ligands read differs
            from the number of protein files.
    """
    cif_files = fi.find_files(input_dir, 'cif')
    proteins = []
    pockets = []
    pdb_codes = []
    for f in tqdm(cif_files, desc='reading structures'):
        if '_protein' in f:
            pdb_codes.append(fi.get_pdb_code(f))
            proteins.append(dt.bp_to_df(dt.read_any(f)))
        elif '_pocket' in f:
            pockets.append(dt.bp_to_df(dt.read_any(f)))

    print('converting proteins...')
    protein_df = pd.concat(proteins)
    pocket_df = pd.concat(pockets)
    # Keep the plain list in ``pdb_codes``: it is reused below as index
    # labels, where a one-column DataFrame is not a valid set of labels.
    pdb_code_df = pd.DataFrame({'pdb': pdb_codes})

    protein_df.to_hdf(hdf_file, key='proteins')
    pocket_df.to_hdf(hdf_file, key='pockets')
    pdb_code_df.to_hdf(hdf_file, key='pdb_codes')

    print('converting ligands...')
    big_sdf = os.path.join(input_dir, 'all_ligands.sdf')
    # The combined file lives inside input_dir; skip it so a re-run does not
    # feed a previous run's output back in as an extra ligand file.
    sdf_files = [
        f for f in fi.find_files(input_dir, 'sdf')
        if os.path.abspath(f) != os.path.abspath(big_sdf)
    ]
    dt.combine_sdfs(sdf_files, big_sdf)
    lig_df = PandasTools.LoadSDF(big_sdf, molColName='Mol')
    # NOTE(review): assumes ligands come back in the same order as the
    # protein files that produced pdb_codes -- confirm against find_files.
    lig_df.index = pdb_codes
    lig_df.to_hdf(hdf_file, key='ligands')

    print('converting labels...')
    label_df = pd.read_csv(label_file)
    label_df = label_df.set_index('pdb').reindex(pdb_codes)
    label_df.to_hdf(hdf_file, key='labels')
Пример #7
0
    def __getitem__(self, index: int):
        """Load the example stored at position ``index`` of the file list.

        The file is read with ``fo.read_xyz``. When ``self._gdb`` is truthy
        the reader's result is unpacked as a 5-tuple (structure, data,
        freq, smiles, inchi); otherwise it is used as the structure itself.

        The returned dict holds the output of ``fo.bp_to_df`` under
        'atoms', the structure's ``id`` under 'id' and the path as a string
        under 'file_path'. In gdb mode it also holds 'labels' (from
        ``self.data_with_subtracted_thchem_energy``) and 'freq'. If
        ``self._transform`` is set, the dict is passed through it.

        Raises:
            IndexError: if ``index`` is not in ``[0, self._num_examples)``.
        """
        if index < 0 or index >= self._num_examples:
            raise IndexError(index)

        path = self._file_list[index]
        use_gdb = self._gdb
        parsed = fo.read_xyz(path, gdb=use_gdb)
        if use_gdb:
            # smiles and inchi are returned by the reader but not used here.
            structure, data, freq, _smiles, _inchi = parsed
        else:
            structure = parsed
        atoms = fo.bp_to_df(structure)

        item = {'atoms': atoms, 'id': structure.id, 'file_path': str(path)}
        if use_gdb:
            item['labels'] = self.data_with_subtracted_thchem_energy(
                data, atoms)
            item['freq'] = freq

        if not self._transform:
            return item
        return self._transform(item)
Пример #8
0
 def process(self):
     """Build one combined pocket+ligand graph per complex and save it.

     Walks ``self.raw_paths``. A path containing '_ligand' is converted to
     a molecule graph and remembered; a path containing '_pocket' is
     converted to a protein graph, combined with the remembered ligand
     graph, labelled via ``get_label`` and saved to
     ``self.processed_dir`` as ``data_<i>.pt``. Other paths are ignored.

     Raises:
         RuntimeError: if a pocket file is reached before the ligand file
             with the same PDB code has been read.
     """
     label_file = os.path.join(self.root, 'pdbbind_refined_set_labels.csv')
     label_df = pd.read_csv(label_file)

     # Most recently read ligand graph and the PDB code it belongs to.
     mol_graph = None
     mol_code = None
     i = 0
     for raw_path in self.raw_paths:
         pdb_code = fi.get_pdb_code(raw_path)
         if '_ligand' in raw_path:
             mol_graph = graph.mol_to_graph(
                 dt.read_sdf_to_mol(raw_path, add_hs=True)[0])
             mol_code = pdb_code
         elif '_pocket' in raw_path:
             # Refuse to pair a pocket with a missing ligand or with the
             # ligand left over from a different complex.
             if mol_graph is None or mol_code != pdb_code:
                 raise RuntimeError(
                     'no ligand graph available for pocket {} '
                     '(last ligand read: {})'.format(raw_path, mol_code))
             # The label is only needed for paths that yield an example.
             y = torch.FloatTensor([get_label(pdb_code, label_df)])
             prot_graph = graph.prot_df_to_graph(
                 dt.bp_to_df(dt.read_any(raw_path, name=pdb_code)))
             node_feats, edge_index, edge_feats, pos = graph.combine_graphs(
                 prot_graph, mol_graph, edges_between=True)
             data = Data(node_feats, edge_index, edge_feats, y=y, pos=pos)
             data.pdb = pdb_code
             torch.save(
                 data,
                 os.path.join(self.processed_dir, 'data_{}.pt'.format(i)))
             i += 1
Пример #9
0
 def _lookup(self, file_path):
     """Return ``seq.get_chain_sequences`` for the structure at ``file_path``."""
     structure = fo.read_any(file_path)
     atoms = fo.bp_to_df(structure)
     return seq.get_chain_sequences(atoms)