def save_sdf(mols, predictions, filename, id_col=None, write_all=False, prediction_col='Prediction'):
    """Write molecules together with per-molecule predictions to an SD file.

    Parameters
    ----------
    mols : iterable of rdkit.Chem.Mol
        Molecules to write; ``predictions`` is aligned with this order.
    predictions : sequence
        One value per molecule; stored as an integer SDF property.
    filename : str
        Path of the SD file to create.
    id_col : str, optional
        Property name to preserve on each molecule even when
        ``write_all`` is False. Ignored if no molecule carries it.
    write_all : bool
        If False (default), all existing properties except ``id_col``
        are removed before writing.
    prediction_col : str
        Name of the SDF property that receives the prediction.
    """
    # Union of every property name seen on any molecule. A plain set
    # union also works for an empty `mols` (the former
    # functools.reduce(operator.or_, ...) raised TypeError on empty input).
    props = set()
    for mol in mols:
        props.update(mol.GetPropNames())
    prediction_items = [prediction_col]
    if id_col is not None and id_col in props:
        props.discard(id_col)  # keep the identifier property on the output
    else:
        id_col = None
    sdw = SDWriter(filename)
    try:
        for i, mol in enumerate(mols):
            if not write_all:
                for prop in props:
                    mol.ClearProp(prop)
            for prediction_item in prediction_items:
                mol.SetIntProp(prediction_item, int(predictions[i]))
            sdw.write(mol)
    finally:
        # Close even if a molecule fails to write, so the file is flushed.
        sdw.close()
def write_sdf_file(scaffold_graph, output_file):
    """Write an SDF file from a scaffoldgraph

    Parameters
    ----------
    scaffold_graph (sg.ScaffoldGraph): graph to be converted
    output_file (str): path to output file
    """
    N = scaffold_graph.num_scaffold_nodes
    # Ascending hierarchy order guarantees every sub-scaffold receives an
    # ID before a parent scaffold references it.
    sorted_scaffolds = sorted(scaffold_graph.get_scaffold_nodes(data=True),
                              key=lambda x: x[1]['hierarchy'])
    mapping = dict(zip([s[0] for s in sorted_scaffolds], range(0, N)))
    writer = SDWriter(output_file)
    for scaffold, data in sorted_scaffolds:
        molecule = MolFromSmiles(scaffold)
        if molecule is not None:
            subscaffolds = list(scaffold_graph.predecessors(scaffold))
            # RDKit SetProp only accepts strings; passing the raw int ID
            # raises an ArgumentError.
            molecule.SetProp('_Name', str(mapping[scaffold]))
            # Read 'hierarchy' from the node data already in hand: the sort
            # key above shows the attribute is lower-case, so the previous
            # nodes[scaffold]['HIERARCHY'] lookup raised KeyError.
            molecule.SetIntProp('HIERARCHY', data['hierarchy'])
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join([str(mapping[s]) for s in subscaffolds]))
            writer.write(molecule)
    writer.close()
def WriteSDF(df, out, molColumn, properties=None, allNumeric=False, titleColumn=None):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be
    exported as SDF tags if specific in the "properties" list. The "allNumeric" flag
    allows to automatically include all numeric columns in the output. "titleColumn"
    can be used to select a column to serve as molecule title. It can be set to
    "RowID" to use the dataframe row key as title.
    '''
    writer = SDWriter(out)
    if properties is None:
        properties = []
    else:
        # Work on a copy so the caller's list is not mutated by the
        # extend/remove calls below.
        properties = list(properties)
    if allNumeric:
        # Auto-select every float/int typed column for export.
        properties.extend([dt for dt in df.dtypes.keys()
                           if (np.issubdtype(df.dtypes[dt], float) or
                               np.issubdtype(df.dtypes[dt], int))])
    if molColumn in properties:
        properties.remove(molColumn)
    if titleColumn in properties:
        properties.remove(titleColumn)
    writer.SetProps(properties)
    for row in df.iterrows():
        # Deep-copy so property edits do not mutate the dataframe's mol.
        mol = copy.deepcopy(row[1][molColumn])
        if titleColumn is not None:
            if titleColumn == 'RowID':
                mol.SetProp('_Name', str(row[0]))
            else:
                # str() added: SetProp raises for non-string values
                # (e.g. a numeric title column).
                mol.SetProp('_Name', str(row[1][titleColumn]))
        for p in properties:
            mol.SetProp(p, str(row[1][p]))
        writer.write(mol)
    writer.close()
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be
    exported as SDF tags if specified in the "properties" list.
    "properties=list(df.columns)" would export all columns.
    The "allNumeric" flag allows to automatically include all numeric columns in the
    output. User has to make sure that correct data type is assigned to column.
    "idName" can be used to select a column to serve as molecule title. It can be set
    to "RowID" to use the dataframe row key as title.
    '''
    close = None
    if isinstance(out, string_types):
        if out.lower()[-3:] == ".gz":
            import gzip
            # SDWriter emits str, so the gzip stream must be text-mode;
            # "wb" makes every write fail with TypeError on Python 3.
            # NOTE(review): if Python 2 support is still required, gate this
            # on a PY3 flag as done elsewhere in this file.
            out = gzip.open(out, "wt")
            close = out.close
    writer = SDWriter(out)
    if properties is None:
        properties = []
    else:
        # Copy so the caller's list is not mutated below.
        properties = list(properties)
    if allNumeric:
        properties.extend([
            dt for dt in df.dtypes.keys()
            if (np.issubdtype(df.dtypes[dt], float) or np.issubdtype(df.dtypes[dt], int))
        ])
    if molColName in properties:
        properties.remove(molColName)
    if idName in properties:
        properties.remove(idName)
    writer.SetProps(properties)
    for row in df.iterrows():
        # make a local copy I can modify
        mol = Chem.Mol(row[1][molColName])
        if idName is not None:
            if idName == 'RowID':
                mol.SetProp('_Name', str(row[0]))
            else:
                mol.SetProp('_Name', str(row[1][idName]))
        for p in properties:
            cell_value = row[1][p]
            # Make sure float does not get formatted in E notation
            if np.issubdtype(type(cell_value), float):
                s = '{:f}'.format(cell_value).rstrip(
                    "0")  # "f" will show 7.0 as 7.00000
                if s[-1] == ".":
                    s += "0"  # put the "0" back on if it's something like "7."
                mol.SetProp(p, s)
            else:
                mol.SetProp(p, str(cell_value))
        writer.write(mol)
    writer.close()
    if close is not None:
        close()
def classify(sdf, label, lambdas):
    """Assign a class label to each molecule in an SD file.

    Reads ``sdf``, converts the property ``label`` to a number (via the
    module-level ``floatify`` helper) and stores the key of the first
    matching predicate in ``lambdas`` as a new property "<label>_class".
    The annotated molecules are written to "<name>_class.sdf" next to the
    input. Molecules that cannot be read, converted, or matched are
    reported to stderr and skipped.

    :param sdf: path to the input SD file
    :param label: name of the numeric property to classify on
    :param lambdas: ordered mapping class-name -> predicate(value) -> bool
    """
    new_filename = "%s_class.sdf" % sdf.split('.sdf')[0]
    new_label = label + "_class"
    sdm = ForwardSDMolSupplier(sdf, strictParsing=False, removeHs=False, sanitize=False)
    sdw = SDWriter(new_filename)
    counter = -1
    i = 0
    for mol in sdm:
        print(i)
        sys.stdout.flush()
        i += 1
        counter += 1
        if mol is None:
            print("%d rdkit couldn't read molecule" % counter, file=sys.stderr)
            sys.stderr.flush()
            continue
        c = None
        prop = floatify(mol.GetProp(label))
        if prop is None:
            print("couldn't convert %s to float or int...skip" % mol.GetProp(label),
                  file=sys.stderr)
            sys.stderr.flush()
            continue
        # First matching predicate wins.
        for k, l in lambdas.items():
            if l(prop):
                c = k
                print("hit %s" % k)
                sys.stdout.flush()
                break
        if c is None:
            print("%d no prop range matched '%s' ..skip" % (counter, mol.GetProp(label)),
                  prop, type(prop), file=sys.stderr)
            sys.stderr.flush()
            sys.stdout.flush()
            continue
        mol.SetProp(new_label, c)
        # except Exception (not bare except): a bare clause would also trap
        # SystemExit/KeyboardInterrupt and make the script unkillable.
        try:
            sdw.write(mol)
        except Exception:
            print(
                "couldn't write mol %d to file, try to build mol from smiles" % i,
                file=sys.stderr)
            # Fallback: rebuild a minimal mol from the stored SMILES.
            mol = MolFromSmiles(mol.GetProp("SMILES"))
            AllChem.Compute2DCoords(mol)
            mol.SetProp(new_label, c)
            try:
                sdw.write(mol)
            except Exception:
                print("couldn't write mol %d to file...skip" % i, file=sys.stderr)
    sdw.close()
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
    '''Export the molecules of a dataframe as an SD file.

    Columns named in "properties" become SDF tags ("properties=list(df.columns)"
    exports everything). With allNumeric=True every float/int column is added
    automatically; the caller is responsible for correct column dtypes.
    "idName" selects the column used as the molecule title, or the special
    value "RowID" to use the dataframe index.
    '''
    closer = None
    # A ".gz" path is transparently wrapped in a gzip text stream
    # (binary on Python 2, where SDWriter produces bytes).
    if isinstance(out, string_types) and out.lower()[-3:] == ".gz":
        import gzip
        out = gzip.open(out, "wt" if PY3 else "wb")
        closer = out.close
    writer = SDWriter(out)
    tags = [] if properties is None else list(properties)
    if allNumeric:
        tags.extend([
            col for col in df.dtypes.keys()
            if np.issubdtype(df.dtypes[col], float) or np.issubdtype(df.dtypes[col], int)
        ])
    # Neither the mol column nor the title column belongs in the tag list.
    for excluded in (molColName, idName):
        if excluded in tags:
            tags.remove(excluded)
    writer.SetProps(tags)
    for index, record in df.iterrows():
        mol = Chem.Mol(record[molColName])  # private copy, safe to mutate
        if idName == 'RowID':
            mol.SetProp('_Name', str(index))
        elif idName is not None:
            mol.SetProp('_Name', str(record[idName]))
        for tag in tags:
            value = record[tag]
            if np.issubdtype(type(value), float):
                # Fixed-point formatting keeps values out of E notation;
                # trim surplus zeros but keep one after a bare decimal point.
                text = '{:f}'.format(value).rstrip("0")
                if text[-1] == ".":
                    text += "0"
                mol.SetProp(tag, text)
            else:
                mol.SetProp(tag, str(value))
        writer.write(mol)
    writer.close()
    if closer is not None:
        closer()
def split(sdf, label_col, folder, splitfold=5):
    """
    Stratified splitting of dataset into k-folds

    :param sdf: Path to input SD file
    :param label_col: Column name of labels for stratification
    :param folder: Folder/model name; created if missing. If None it is
        derived from the SD file name next to the input file.
    :param splitfold: k number of folds
    :return: dict with 'train_files'/'test_files' path lists, and the folder
    """
    if folder is None:
        sdf_path = pathlib.Path(sdf)
        sdf_name = sdf_path.name.partition('.')[0]
        folder = sdf_path.parent.joinpath(sdf_name)
        if not folder.is_dir():
            folder.mkdir()
        folder = folder.absolute()
    else:
        p = pathlib.Path(folder)
        if not p.is_dir():
            p.mkdir()
    train_files = []
    test_files = []
    mols = list(SDMolSupplier(sdf))
    # One label per molecule drives the stratification.
    # NOTE(review): a molecule that RDKit fails to parse is None and would
    # crash GetProp here — confirm upstream guarantees parseable input.
    labels = [mol.GetProp(label_col) for mol in mols]
    skf = StratifiedKFold(n_splits=splitfold)
    # enumerate replaces the previous hand-maintained fold counter.
    for fold, (train_ix, test_ix) in enumerate(skf.split(mols, labels)):
        test_set_fn = "{}/testset_{}.sdf".format(folder, fold)
        train_set_fn = "{}/trainset_{}.sdf".format(folder, fold)
        sdw_train = SDWriter(train_set_fn)
        for i in train_ix:
            sdw_train.write(mols[i])
        sdw_train.close()
        train_files.append(train_set_fn)
        sdw_test = SDWriter(test_set_fn)
        for i in test_ix:
            sdw_test.write(mols[i])
        sdw_test.close()
        test_files.append(test_set_fn)
    return {'train_files': train_files, 'test_files': test_files}, folder
def preprocess_mols(mols, session_id):
    """Annotate molecules with nearest-neighbour info and save one SDF each.

    Molecules with any known 'experimental' property are treated as the
    reference ("known") set; all others are matched against it by Tanimoto
    similarity. Each molecule is written to ``uploads/<session_id>/<idx>.sdf``
    with 'NN' and 'Similarity_Tanimoto' properties attached.

    :param mols: sequence of rdkit.Chem.Mol with SDF properties
    :param session_id: name of the per-session upload subdirectory
    :return: path of the created session directory
    :raises RuntimeError: if the session directory already exists
    """
    session_dir = join('uploads', session_id)
    mols = np.array(mols)
    df = pd.DataFrame([m.GetPropsAsDict() for m in mols])
    df['NN'] = np.nan
    exp_cols = [c for c in df.columns if 'experimental' in c]
    # A molecule is "known" if any experimental value is present.
    experimental_mask = np.any(~pd.isna(df[exp_cols]), 1)
    # TO DO - check if there is no issues with preserving order
    test_mols = mols[~experimental_mask]
    test_mols_ids = df.index[~experimental_mask]
    known_mols = mols[experimental_mask]
    known_mols_ids = df.index[experimental_mask]
    # Nearest neighbours for molecules without experimental data,
    # i.e. molecules that were not previously tested.
    if len(test_mols):
        test_nns_idx, similarity = get_Tanimoto_NNs(test_mols, known_mols, 3,
                                                    nns=50, return_sim=True)
        # NOTE(review): test_nns_idx indexes into known_mols, so indexing
        # test_mols_ids here looks suspicious — confirm against
        # get_Tanimoto_NNs' contract before changing.
        test_nns_ids = test_mols_ids[test_nns_idx]
        formatted = list(map(repr, test_nns_ids.tolist()))
        # .loc (not .at): .at supports only scalar label access and raises
        # when given a boolean-mask indexer.
        df.loc[~experimental_mask.values, 'NN'] = formatted
        df.loc[~experimental_mask.values, 'Similarity_Tanimoto'] = similarity[:, 0]
    # Nearest neighbours for molecules that do have experimental data.
    if len(known_mols):
        known_nns_idx, similarity = get_Tanimoto_NNs(known_mols, known_mols, 3,
                                                     order=1, nns=50, return_sim=True)
        known_nns_ids = known_mols_ids[known_nns_idx]
        formatted = list(map(repr, known_nns_ids.tolist()))
        df.loc[experimental_mask.values, 'NN'] = formatted
        df.loc[experimental_mask.values, 'Similarity_Tanimoto'] = similarity[:, 0]
    # Save molecules as dataset.
    # Refuse to overwrite a previous session of the same name.
    if exists(session_dir):
        raise RuntimeError('The session directory %s already exists!' % session_dir)
    else:
        makedirs(session_dir)
    # Write NN id + similarity to SDF properties; each molecule goes to a
    # separate file named after its dataframe index.
    for idx, mol in zip(df.index, mols):
        writer = SDWriter(join(session_dir, '%d.sdf' % idx))
        mol.SetProp('NN', '%s' % df.loc[idx]['NN'])
        mol.SetProp('Similarity_Tanimoto', '%s' % df.loc[idx]['Similarity_Tanimoto'])
        writer.write(mol)
        writer.close()
    return session_dir
def csv_to_sdf(csv_file, sdf_file, smiles_col, class_col, delim=','):
    """Convert a delimited text file of SMILES and activity classes to an SD file.

    The first line is treated as a header and skipped. The activity class is
    translated through the module-level ``activity_label_to_id_map`` and
    stored as the "TL" property of each molecule.

    :param csv_file: path to the delimited input file
    :param sdf_file: path of the SD file to create
    :param smiles_col: zero-based index of the SMILES column
    :param class_col: zero-based index of the activity-class column
    :param delim: field delimiter (default ',')
    """
    sdw = SDWriter(sdf_file)
    with open(csv_file) as fh:
        next(fh, None)  # skip the header row
        # Iterate the file lazily instead of materializing it with readlines().
        for line in fh:
            line_split = line.strip().split(delim)
            # Naive quote stripping; fields containing the delimiter inside
            # quotes are NOT handled (would need the csv module).
            smiles = line_split[smiles_col].replace('"', '')
            act_class = line_split[class_col].replace('"', '')
            act_newLabel = activity_label_to_id_map[act_class]
            mol = MolFromSmiles(smiles)
            mol.SetProp("TL", act_newLabel)
            sdw.write(mol)
    sdw.close()
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be
    exported as SDF tags if specified in the "properties" list.
    "properties=list(df.columns)" would export all columns.
    The "allNumeric" flag allows to automatically include all numeric columns in the
    output. User has to make sure that correct data type is assigned to column.
    "idName" can be used to select a column to serve as molecule title. It can be set
    to "RowID" to use the dataframe row key as title.
    '''
    writer = SDWriter(out)
    if properties is None:
        properties = []
    else:
        # Copy so the caller's list is not mutated by extend/remove below.
        properties = list(properties)
    if allNumeric:
        properties.extend([
            dt for dt in df.dtypes.keys()
            if (np.issubdtype(df.dtypes[dt], float) or np.issubdtype(df.dtypes[dt], int))
        ])
    if molColName in properties:
        properties.remove(molColName)
    if idName in properties:
        properties.remove(idName)
    writer.SetProps(properties)
    for row in df.iterrows():
        mol = copy.deepcopy(row[1][molColName])
        # Remove embeded props
        for prop in mol.GetPropNames():
            mol.ClearProp(prop)
        if idName is not None:
            if idName == 'RowID':
                mol.SetProp('_Name', str(row[0]))
            else:
                mol.SetProp('_Name', str(row[1][idName]))
        for p in properties:
            cell_value = row[1][p]
            # Make sure float does not get formatted in E notation
            if np.issubdtype(type(cell_value), float):
                s = '{:f}'.format(cell_value).rstrip('0')
                if s[-1] == '.':
                    s += '0'  # restore the zero so 7.0 emits "7.0", not "7."
                mol.SetProp(p, s)
            else:
                mol.SetProp(p, str(cell_value))
        writer.write(mol)
    writer.close()
def write_sdf_file(scaffold_graph, output_file):
    """Write an SDF file from a ScaffoldGraph.

    All scaffolds in the scaffoldgraph are written to the SDF, while
    molecules are ignored. Scaffolds are sorted in ascending order
    according to their hierarchy level.

    The output follows the standard SDF specification with the added
    property fields:

        TITLE field: scaffold ID
        SUBSCAFFOLDS field: list of sub-scaffold IDs
        HIERARCHY field: hierarchy level of scaffold
        SMILES field: scaffold canonical SMILES

    Parameters
    ----------
    scaffold_graph : scaffoldgraph.core.ScaffoldGraph
        ScaffoldGraph to be written to an SDF.
    output_file : str
        Filepath to an output file.
    """
    N = scaffold_graph.num_scaffold_nodes
    sorted_scaffolds = sorted(scaffold_graph.get_scaffold_nodes(data=True),
                              key=lambda x: x[1]['hierarchy'])
    mapping = dict(zip([s[0] for s in sorted_scaffolds], range(0, N)))
    writer = SDWriter(output_file)
    for scaffold, data in sorted_scaffolds:
        molecule = MolFromSmiles(scaffold)
        if molecule is not None:
            subscaffolds = list(scaffold_graph.predecessors(scaffold))
            # RDKit SetProp only accepts strings; the raw int ID raised an
            # ArgumentError.
            molecule.SetProp('_Name', str(mapping[scaffold]))
            # Use the lower-case 'hierarchy' attribute already fetched with
            # the node data; the former nodes[scaffold]['HIERARCHY'] lookup
            # used a key that does not exist (see the sort key above).
            molecule.SetIntProp('HIERARCHY', data['hierarchy'])
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join([str(mapping[s]) for s in subscaffolds]))
            writer.write(molecule)
    writer.close()
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be
    exported as SDF tags if specified in the "properties" list.
    "properties=list(df.columns)" would export all columns.
    The "allNumeric" flag allows to automatically include all numeric columns in the
    output. User has to make sure that correct data type is assigned to column.
    "idName" can be used to select a column to serve as molecule title. It can be set
    to "RowID" to use the dataframe row key as title.
    '''
    writer = SDWriter(out)
    if properties is None:
        properties = []
    else:
        # Copy so the caller's list is not mutated by extend/remove below.
        properties = list(properties)
    if allNumeric:
        properties.extend([dt for dt in df.dtypes.keys()
                           if (np.issubdtype(df.dtypes[dt], float) or
                               np.issubdtype(df.dtypes[dt], int))])
    if molColName in properties:
        properties.remove(molColName)
    if idName in properties:
        properties.remove(idName)
    writer.SetProps(properties)
    for row in df.iterrows():
        mol = copy.deepcopy(row[1][molColName])
        # Remove embeded props
        for prop in mol.GetPropNames():
            mol.ClearProp(prop)
        if idName is not None:
            if idName == 'RowID':
                mol.SetProp('_Name', str(row[0]))
            else:
                mol.SetProp('_Name', str(row[1][idName]))
        for p in properties:
            cell_value = row[1][p]
            # Make sure float does not get formatted in E notation
            if np.issubdtype(type(cell_value), float):
                s = '{:f}'.format(cell_value).rstrip('0')
                if s[-1] == '.':
                    s += '0'  # restore the zero so 7.0 emits "7.0", not "7."
                mol.SetProp(p, s)
            else:
                mol.SetProp(p, str(cell_value))
        writer.write(mol)
    writer.close()
def mols_to_sdf(mols: List[Mol], path: str) -> Optional[str]:
    """
    Writes all molecules from `mols` to an SDF with the given path.

    Parameters
    ----------
    mols : List[Mol]
        List of RDKit mol objects to write into a SDF
    path : str
        The path, where the SDF should be written to

    Returns
    -------
    Optional[str]
        None, if all went fine. A string containing an error message otherwise.
    """
    try:
        sdw = SDWriter(path)
        try:
            for mol in mols:
                sdw.write(mol)
        finally:
            # Guarantee flush/close even if a molecule fails to write.
            sdw.close()
    except OSError:
        return f'Could not create output file: {abspath(path)}'
    return None
def WriteSDF(df, out, molColumn, properties=None, allNumeric=False, titleColumn=None):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be
    exported as SDF tags if specific in the "properties" list. The "allNumeric" flag
    allows to automatically include all numeric columns in the output. "titleColumn"
    can be used to select a column to serve as molecule title. It can be set to
    "RowID" to use the dataframe row key as title.
    '''
    writer = SDWriter(out)
    if properties is None:
        properties = []
    else:
        # Copy so the caller's list is not mutated by extend/remove below.
        properties = list(properties)
    if allNumeric:
        properties.extend([
            dt for dt in df.dtypes.keys()
            if (np.issubdtype(df.dtypes[dt], float) or np.issubdtype(df.dtypes[dt], int))
        ])
    if molColumn in properties:
        properties.remove(molColumn)
    if titleColumn in properties:
        properties.remove(titleColumn)
    writer.SetProps(properties)
    for row in df.iterrows():
        # Deep-copy so property edits do not mutate the dataframe's mol.
        mol = copy.deepcopy(row[1][molColumn])
        if titleColumn is not None:
            if titleColumn == 'RowID':
                mol.SetProp('_Name', str(row[0]))
            else:
                # str() added: SetProp raises for non-string values
                # (e.g. a numeric title column).
                mol.SetProp('_Name', str(row[1][titleColumn]))
        for p in properties:
            mol.SetProp(p, str(row[1][p]))
        writer.write(mol)
    writer.close()
X, y = make_dataset(f'{prediction_set}.sdf', data_dir=env_var, features=features, name_col=name_col, endpoint=endpoint, threshold=threshold, cache=False) y = y.reindex(X_pred.index) y[y.isnull()] = final_preds y.to_csv(os.path.join( data_dir, 'predictions', f'{prediction_set}_{features}_{endpoint}_{threshold}_no_gaps.csv'), header=['Activities']) for molecule in molecules: if not molecule.HasProp(endpoint): molecule.SetProp(endpoint, str(y.loc[molecule.GetProp(name_col)])) else: for molecule in molecules: molecule.SetProp(endpoint, str(final_preds.loc[molecule.GetProp(name_col)])) w = SDWriter(os.path.join(data_dir, f'{prediction_set}_with_predictions.sdf')) for molecule in molecules: w.write(molecule) w.close()
""" This script combines multiple SDF files specified via commandline arguments to one SDF file. It will be saved as "combined_training_datasets.sdf". The script tries to preserve the information about the original dataset name by splitting the file name at "_" and saving the first element of this split as SDF file tag named "original_dataset". """ from sys import argv from rdkit.Chem import SDWriter, SDMolSupplier __author__ = 'Marcel Baltruschat' __copyright__ = 'Copyright © 2020' __license__ = 'MIT' __version__ = '1.0.0' sdw = SDWriter('combined_training_datasets.sdf') for f in argv[1:]: dsname = f.split('_')[0] sdm = SDMolSupplier(f) for mol in sdm: mol.SetProp('original_dataset', dsname) sdw.write(mol) sdw.close()
class AggregateCLI(object):
    """Aggregate output TSV files (CLI)."""

    def __init__(self, args):
        # `args`: parsed argparse namespace; kept for later option checks.
        self.args = args
        self.inputs = args.input
        if args.sdf:
            # Silence RDKit parse warnings (level 4 = errors only) while
            # writing SDF output.
            rdlogger.setLevel(4)
            self.output = SDWriter(args.output)
        else:
            self.output = open(args.output, 'w')
        # Optional molecule->scaffold mapping file (TSV with header).
        self.mol_map = open(args.map_mols, 'w') if args.map_mols else None
        if self.mol_map:
            self.mol_map.write('MOLECULE_ID\tSCAFFOLD_ID\n')
        # Optional scaffold->annotation mapping file (TSV with header).
        self.ann_map = open(args.map_annotations, 'w') if args.map_annotations else None
        if self.ann_map:
            self.ann_map.write('SCAFFOLD_ID\tANNOTATIONS\n')
        # Next scaffold ID to assign; incremented per unique scaffold SMILES.
        self.current_id = 0
        # How many scaffolds were seen more than once across all inputs.
        self.duplicates = 0
        # Registry of seen scaffolds: SMILES -> {'ID': int, 'PARENTS': list}.
        self.table = {}

    def aggregate(self):
        # Process every input file in order; IDs are assigned on first sight,
        # so input order determines the final numbering.
        if not self.args.sdf:
            # TSV output gets a header row; SDF needs none.
            self.output.write('ID\tHIERARCHY\tSMILES\tSUBSCAFFOLDS\n')
        for file in self.inputs:
            logger.info(f'Processing file: {file}...')
            with open(file, 'r') as fw:
                self.process_file(fw)

    def process_file(self, file):
        # Deduplicate scaffolds by SMILES while resolving sub-scaffold IDs.
        reader = ScaffoldFileIterator(file)
        for scaffold in reader:
            s_smiles = scaffold.smiles
            write = False
            if s_smiles in self.table:
                # Already registered: reuse the existing ID, don't re-write.
                scaffold.id = self.table[s_smiles]['ID']
            else:
                # First sighting: assign the next ID and mark for output.
                scaffold.id = self.current_id
                self.table[s_smiles] = dict(ID=self.current_id, PARENTS=[])
                self.current_id += 1
                write = True
            # Resolve sub-scaffold IDs; collect positions of parents that
            # have no registered ID yet.
            missing = []
            for idx, parent in enumerate(scaffold.subscaffolds):
                p_smiles = parent.smiles
                if p_smiles in self.table:
                    parent.id = self.table[p_smiles]['ID']
                else:
                    missing.append(idx)
            # Delete unresolved parents back-to-front so earlier indices
            # stay valid.
            for m in sorted(missing, reverse=True):
                del scaffold.subscaffolds[m]
            if write is True:
                self.write_scaffold(scaffold)
                self.write_extra_outputs(scaffold)
            else:
                self.duplicates += 1
                # Duplicates still contribute molecule/annotation mappings.
                self.write_extra_outputs(scaffold)

    def write_scaffold(self, scaffold):
        # Emit one scaffold either as an SDF record or as a TSV row,
        # depending on the --sdf flag.
        subscaffolds = ', '.join([str(s.id) for s in scaffold.subscaffolds])
        if self.args.sdf:
            molecule = MolFromSmiles(scaffold.smiles)
            if molecule is not None:
                molecule.SetProp('_Name', str(scaffold.id))
                molecule.SetIntProp('HIERARCHY', scaffold.hierarchy)
                molecule.SetProp('SMILES', scaffold.smiles)
                molecule.SetProp('SUBSCAFFOLDS', subscaffolds)
                self.output.write(molecule)
            else:
                # Unparseable scaffolds are logged and dropped from the SDF.
                logger.warning(f'Failed to parse scaffold: {scaffold.smiles}')
        else:
            self.output.write('{0}\t{1}\t{2}\t{3}\n'.format(
                scaffold.id, scaffold.hierarchy, scaffold.smiles, subscaffolds))

    def write_extra_outputs(self, scaffold):
        # Write molecule --> scaffold ID file
        if self.mol_map is not None:
            for molecule in scaffold.molecules:
                self.mol_map.write('{0}\t{1}\n'.format(molecule, scaffold.id))
        # Write scaffold ID --> annotation file
        if self.ann_map is not None:
            for annotation in scaffold.annotations:
                self.ann_map.write('{0}\t{1}\n'.format(scaffold.id, annotation))

    def __enter__(self):
        # Support `with AggregateCLI(args) as cli:` for guaranteed cleanup.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Close the main output and any optional mapping files.
        self.output.close()
        if self.mol_map is not None:
            self.mol_map.close()
        if self.ann_map is not None:
            self.ann_map.close()
def write_sdf(mol, path):
    """Write a single molecule to an SD file at ``path``.

    :param mol: rdkit.Chem.Mol to write
    :param path: destination file path
    """
    writer = SDWriter(path)
    try:
        writer.write(mol)
    finally:
        # Close even if the write raises, so the file handle is released.
        writer.close()
def process( self, input_file: str, output_file: str = "", output_file_sdf: str = "", sdf_append: bool = False, #images_prefix: str = "", format_output: bool = True, write_header: bool = True, osra_output_format: str = "", output_formats: list = None, dry_run: bool = False, csv_delimiter: str = ";", use_gm: bool = True, gm_dpi: int = 300, gm_trim: bool = True, n_jobs: int = -1, input_type: str = "", standardize_mols: bool = True, annotate: bool = True, chemspider_token: str = "", custom_page: int = 0, continue_on_failure: bool = False) -> OrderedDict: r""" Process the input file with OSRA. Parameters ---------- input_file : str Path to file to be processed by OSRA. output_file : str File to write output in. output_file_sdf : str | File to write SDF output in. "sdf" output format hasn't to be in `output_formats` to write SDF output. | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added. sdf_append : bool If True, append new molecules to existing SDF file or create new one if doesn't exist. NOT IMPLEMENTED | images_prefix : str Prefix for images of extracted compounds which will be written. format_output : bool | If True, the value of "content" key of returned dict will be list of OrderedDicts. | If True and `output_file` is set, the CSV file will be written. | If False, the value of "content" key of returned dict will be None. write_header : bool If True and if `output_file` is set and `output_format` is True, write a CSV write_header. osra_output_format : str | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__). | Choices: "smi", "can", "sdf" | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet). output_formats : list | If True and `format_output` is also True, this specifies which molecule formats will be output. | You can specify more than one format, but only one format from OSRA. 
This format must be also set with `output_format` in __init__ or with `osra_output_format` here. | When output produces by OSRA is unreadable by RDKit, you can at least have that output from OSRA. | Default value: ["smiles"] +-----------------+--------------+--------------------------------------------------------------------------------------------+ | Value | Source | Note | +=================+==============+============================================================================================+ | smiles | RDKit | canonical | +-----------------+--------------+--------------------------------------------------------------------------------------------+ | smiles_osra | OSRA ("smi") | SMILES | +-----------------+--------------+--------------------------------------------------------------------------------------------+ | smiles_can_osra | OSRA ("can") | canonical SMILES | +-----------------+--------------+--------------------------------------------------------------------------------------------+ | inchi | RDKit | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) | +-----------------+--------------+--------------------------------------------------------------------------------------------+ | inchikey | RDKit | The same applies as for "inchi". | +-----------------+--------------+--------------------------------------------------------------------------------------------+ | sdf | RDKit | If present, an additional SDF file will be created. | +-----------------+--------------+--------------------------------------------------------------------------------------------+ | sdf_osra | OSRA ("sdf") | If present, an additional SDF file will be created. | +-----------------+--------------+--------------------------------------------------------------------------------------------+ dry_run : bool If True, only return list of commands to be called by subprocess. csv_delimiter : str Delimiter for output CSV file. 
use_gm : bool | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing. | If False, OSRA will use it's own conversion of PDF to image. | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are sometimes incorrectly recognised structures. gm_dpi : int How many DPI will temporary PNG images have. gm_trim : bool If True, gm will trim the temporary PNG images. n_jobs : int | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images. | If -1 all CPUs are used. | If 1 is given, no parallel computing code is used at all, which is useful for debugging. | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. input_type : str | When empty, input (MIME) type will be determined from magic bytes. | Or you can specify "pdf" or "image" and magic bytes check will be skipped. standardize_mols : bool If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules. annotate : bool | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with each identifier, separately for SMILES, InChI etc. | If entity has InChI key yet, prefer it in searching. | If "*" is present in SMILES, skip annotation. chemspider_token : str Your personal token for accessing the ChemSpider API. Make account there to obtain it. custom_page : bool When `use_gm` is False, this will set the page for all extracted compounds. continue_on_failure : bool | If True, continue running even if OSRA returns non-zero exit code. | If False and error occurs, print it and return. Returns ------- dict Keys: - stdout: str ... standard output from OSRA - stderr: str ... standard error output from OSRA - exit_code: int ... exit code from OSRA - content: - list of OrderedDicts ... 
when `format_output` is True. - None ... when `format_output` is False | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved. | If `use_gm` is True then stdout, stderr and exit_code will be lists containing items from each temporary image extracted by OSRA. Notes ----- Only with `format_output` set to True you can use molecule standardization and more molecule formats. Otherwise you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set). """ options_internal = self.options_internal.copy() osra_smiles_outputs = ["smi", "can"] # OSRA output format check if osra_output_format: options_internal["output_format"] = osra_output_format else: osra_output_format = options_internal["output_format"] osra_valid_output_formats = { "can": "smiles_can_osra", "smi": "smiles_osra", "sdf": "sdf_osra" } if osra_output_format not in osra_valid_output_formats: raise ValueError( "Unknown OSRA output format. Possible values: {}".format( osra_valid_output_formats.values())) if osra_output_format == "sdf": self.logger.warning( "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved." 
) # output formats check is_output_sdf = False is_output_sdf_osra = False if not output_formats: output_formats = ["smiles"] else: output_formats = sorted(list(set(output_formats))) possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"] output_formats = [ x for x in output_formats if x in possible_output_formats or x == osra_valid_output_formats[osra_output_format] ] if ("sdf" in output_formats or "sdf_osra" in output_formats) and not output_file_sdf: self.logger.warning( "Cannot write SDF output: 'output_file_sdf' is not set.") if output_file_sdf: is_output_sdf = True if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf: is_output_sdf_osra = True if ("smiles_osra" in output_formats or "smiles_can_osra" in output_formats) and osra_output_format == "sdf": try: output_formats.remove("smiles_osra") except ValueError: pass try: output_formats.remove("smiles_can_osra") except ValueError: pass self.logger.warning( "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"." .format(osra_output_format)) # input file type check possible_input_types = ["pdf", "image"] if not input_type: input_type = get_input_file_type(input_file) if input_type not in possible_input_types: use_gm = False self.logger.warning( "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)." .format(input_type, possible_input_types)) elif input_type not in possible_input_types: raise ValueError("Possible 'input_type' values are {}".format( possible_input_types)) #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v}, # options_internal) if annotate: if not chemspider_token: self.logger.warning( "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty." 
) [ output_formats.append(x) for x in ["smiles", "inchi", "inchikey"] if x not in output_formats ] output_formats = sorted(output_formats) commands, _, _ = self.build_commands(options_internal, self._OPTIONS_REAL, self.path_to_binary) commands.extend( ["--bond", "--coordinates", "--page", "--guess", "--print"]) if dry_run: return " ".join(commands) osra_output_list = [] if input_type == "image" or not use_gm: osra_output_list.append( self._process(input_file, commands, page=custom_page if custom_page else 1)) elif input_type == "pdf": with tempfile.TemporaryDirectory() as temp_dir: stdout, stderr, exit_code = pdf_to_images(input_file, temp_dir, dpi=gm_dpi, trim=gm_trim) osra_output_list = Parallel(n_jobs=n_jobs)( delayed(self._process)( temp_image_file, commands, page=page) for temp_image_file, page in get_temp_images(temp_dir)) # summarize OSRA results to_return = { "stdout": [], "stderr": [], "exit_code": [], "content": None, "pages": [] } for result in osra_output_list: if result["stdout"]: to_return["stdout"].append(result["stdout"]) to_return["stderr"].append(result["stderr"]) to_return["exit_code"].append(result["exit_code"]) to_return["pages"].append(result["page"]) if not continue_on_failure: errors = [(page + 1, error) for page, (exit_code, error) in enumerate( zip(to_return["exit_code"], to_return["stderr"])) if exit_code > 0] if errors: self.logger.warning("OSRA errors:") for page, error in errors: eprint("\tError on page {}:".format(page)) eprint("\n\t\t".join("\n{}".format(error).splitlines())) return to_return if not format_output: if output_file: with open(output_file, mode="w", encoding="utf-8") as f: f.write("\n".join(to_return["stdout"])) return to_return output_cols = OrderedDict([("bond_length", 1), ("resolution", 2), ("confidence", 3), ("page", 4), ("coordinates", 5)]) if osra_output_format in osra_smiles_outputs: compound_template_dict = OrderedDict.fromkeys( output_formats + list(output_cols.keys())) else: compound_template_dict = 
OrderedDict.fromkeys(["page"] + output_formats) if any(to_return["stdout"]): if standardize_mols: standardizer = Standardizer() compounds = [] if is_output_sdf: if sdf_append: if not os.path.isfile(output_file_sdf): open(output_file_sdf, mode="w", encoding="utf-8").close() writer = SDWriter( open(output_file_sdf, mode="a", encoding="utf-8")) else: writer = SDWriter(output_file_sdf) for output, page in zip(to_return["stdout"], to_return["pages"]): if osra_output_format in osra_smiles_outputs: lines = [x.strip() for x in output.split("\n") if x] else: lines = [x for x in output.split("$$$$") if x.strip()] for line in lines: """ # so much problems with --learn # we can't simply split output by " " when --learn is present, because its output is like "1,2,2,2 1" if "learn" in filtered_cols: learn_start = filtered_cols.index("learn") + 1 # "smiles" col isn't in output_cols learn_end = filtered_cols.index("learn") + 1 + 3 line[learn_start:learn_end] = [" ".join(line[learn_start:learn_end])] """ if not line: continue if osra_output_format in osra_smiles_outputs: line = [x.strip() for x in line.split()] if custom_page: line[output_cols["page"]] = custom_page elif use_gm: line[output_cols["page"]] = page mol = MolFromSmiles( line[0], sanitize=False if standardize_mols else True) elif osra_output_format == "sdf": line = "\n" + line.strip() mol = MolFromMolBlock( line, strictParsing=False, sanitize=False if standardize_mols else True, removeHs=False if standardize_mols else True) if mol: compound = compound_template_dict.copy() if standardize_mols: try: mol = standardizer.standardize(mol) except ValueError as e: self.logger.warning( "Cannot standardize '{}': {}".format( MolToSmiles(mol), str(e))) for f in output_formats: if f == "smiles": compound["smiles"] = MolToSmiles( mol, isomericSmiles=True) elif f == "smiles_osra" and osra_output_format == "smi": compound["smiles_osra"] = line[0] elif f == "smiles_can_osra" and osra_output_format == "can": compound["smiles_can_osra"] = 
line[0] elif f == "inchi": inchi = MolToInchi(mol) if inchi: compound["inchi"] = inchi else: compound["inchi"] = "" self.logger.warning( "Cannot convert to InChI: {}".format( MolToSmiles(mol))) elif f == "inchikey": inchi = MolToInchi(mol) if inchi: compound["inchikey"] = InchiToInchiKey( inchi) else: compound["inchikey"] = "" self.logger.warning( "Cannot create InChI-key from InChI: {}" .format(MolToSmiles(mol))) elif f == "sdf": compound["sdf"] = MolToMolBlock( mol, includeStereo=True) elif f == "sdf_osra": compound["sdf_osra"] = line if is_output_sdf: writer.write(mol) if osra_output_format in osra_smiles_outputs: compound.update([(x[0], x[1]) for x in zip( list(output_cols.keys()), line[1:])]) else: compound[ "page"] = page if use_gm else custom_page if custom_page else 1 compounds.append(compound) else: self.logger.warning("Cannot convert to RDKit mol: " + line[0]) if is_output_sdf_osra: with open(output_file_sdf + "-osra.sdf", mode="w", encoding="utf-8") as f: f.write("".join(to_return["stdout"])) to_return["content"] = sorted(compounds, key=lambda x: x["page"]) if annotate: chemspider = ChemSpider( chemspider_token) if chemspider_token else None for i, ent in enumerate(to_return["content"]): self.logger.info("Annotating entity {}/{}...".format( i + 1, len(to_return["content"]))) ent.update( OrderedDict([("pch_cids_by_inchikey", ""), ("chs_cids_by_inchikey", ""), ("pch_cids_by_smiles", ""), ("chs_cids_by_smiles", ""), ("pch_cids_by_inchi", ""), ("chs_cids_by_inchi", ""), ("pch_iupac_name", ""), ("chs_common_name", ""), ("pch_synonyms", "")])) results = [] # prefer InChI key if "inchikey" in ent and ent["inchikey"]: try: results = get_compounds(ent["inchikey"], "inchikey") if results: if len(results) == 1: result = results[0] synonyms = result.synonyms if synonyms: ent["pch_synonyms"] = "\"{}\"".format( "\",\"".join(synonyms)) ent["pch_iupac_name"] = result.iupac_name ent["pch_cids_by_inchikey"] = "\"{}\"".format( ",".join([str(c.cid) for c in results])) 
except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError): pass results = chemspider.search( ent["inchikey"]) if chemspider_token else [] if results: if len(results) == 1: result = results[0] ent["chs_common_name"] = result.common_name ent["chs_cids_by_inchikey"] = "\"{}\"".format( ",".join([str(c.csid) for c in results])) else: for search_field, col_pch, col_chs in [ ("smiles", "pch_cids_by_smiles", "chs_cids_by_smiles"), ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi") ]: results_pch = [] results_chs = [] if search_field == "smiles" and "smiles" in ent and ent[ "smiles"] and "*" not in ent["smiles"]: try: results_pch = get_compounds( ent["smiles"], "smiles") except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError): pass results_chs = chemspider.search( ent["smiles"]) if chemspider_token else [] elif search_field == "inchi" and "inchi" in ent and ent[ "inchi"]: try: results_pch = get_compounds( ent["inchi"], "inchi") except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError): pass results_chs = chemspider.search( ent["inchi"]) if chemspider_token else [] if results_pch: ent[col_pch] = "\"{}\"".format(",".join( [str(c.cid) for c in results_pch])) if results_chs: ent[col_chs] = "\"{}\"".format(",".join( [str(c.csid) for c in results_chs])) sleep(0.5) if output_file: dict_to_csv(to_return["content"], output_file=output_file, csv_delimiter=csv_delimiter, write_header=write_header) if is_output_sdf: writer.close() elif not any(to_return["stdout"]) and output_file: write_empty_file(output_file, csv_delimiter=csv_delimiter, header=list(compound_template_dict.keys()), write_header=write_header) return to_return
class SelectCLI(object):
    """Select scaffolds using a molecular query on an aggregated TSV (CLI).

    Computes Murcko scaffolds for every molecule in the query input, then
    scans the aggregated scaffold file and emits every scaffold whose SMILES
    matches the query — together with all of its parent scaffolds — either
    as TSV rows or as an SDF file, depending on ``args.sdf``.
    """

    def __init__(self, args):
        self.args = args
        self.q_input = args.input_query
        self.g_input = open(args.input_graph, 'r')
        # SDF output goes through RDKit's SDWriter; TSV output is a plain file.
        if args.sdf:
            rdlogger.setLevel(4)  # quiet RDKit logging while writing SDF
            self.output = SDWriter(args.output)
        else:
            self.output = open(args.output, 'w')
        self.query = set()             # Murcko scaffold SMILES built from the query input
        self.matching_parents = set()  # ids of parents of scaffolds already matched
        self.count = 0                 # number of scaffolds selected so far

    def select(self):
        """Run the selection and write every matching scaffold to the output."""
        if not self.args.sdf:
            self.output.write('ID\tHIERARCHY\tSMILES\tSUBSCAFFOLDS\n')
        self.load_query()
        logger.info('Processing query...')
        # Iterate children-first (reverse hierarchy order) so a match can
        # propagate upward to its parents via matching_parents.
        for scaffold in ScaffoldFileIterator(self.g_input, reverse=True):
            hit = (scaffold.smiles in self.query
                   or scaffold.id in self.matching_parents)
            if hit:
                self.count += 1
                self.write_scaffold(scaffold)
                for parent in scaffold.subscaffolds:
                    self.matching_parents.add(parent.id)

    def load_query(self):
        """Fill ``self.query`` with Murcko scaffold SMILES of the query molecules.

        Raises
        ------
        ValueError
            If the query file format is neither SMILES nor SDF.
        """
        logger.info('Reading molecular query...')
        handle = None
        fmt = file_format(self.q_input)
        if fmt[0] == 'SMI':
            supplier = smiles.read_smiles_file(self.q_input)
        elif fmt[0] == 'SDF':
            rdlogger.setLevel(4)
            handle = open(self.q_input, 'rb')
            supplier = sdf.read_sdf(handle)
        else:
            raise ValueError('input file format not currently supported')
        for molecule in supplier:
            if molecule is None:
                continue
            self.query.add(MolToSmiles(get_murcko_scaffold(molecule)))
        if handle is not None:
            handle.close()
        logger.info(f'Read {len(self.query)} query scaffolds')

    def write_scaffold(self, scaffold):
        """Write one scaffold record in the configured output format."""
        sub_ids = ', '.join(str(s.id) for s in scaffold.subscaffolds)
        if not self.args.sdf:
            self.output.write('{0}\t{1}\t{2}\t{3}\n'.format(
                scaffold.id, scaffold.hierarchy, scaffold.smiles, sub_ids))
            return
        molecule = MolFromSmiles(scaffold.smiles)
        if molecule is None:
            logger.warning(f'Failed to parse scaffold: {scaffold.smiles}')
            return
        molecule.SetProp('_Name', str(scaffold.id))
        molecule.SetIntProp('HIERARCHY', scaffold.hierarchy)
        molecule.SetProp('SMILES', scaffold.smiles)
        molecule.SetProp('SUBSCAFFOLDS', sub_ids)
        self.output.write(molecule)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Close both the graph input and whichever output object was opened
        # (SDWriter and plain file objects both expose close()).
        self.g_input.close()
        self.output.close()