def create_molecules_from_smiles_file(file: BinaryIO) -> MoleculeSet:
    """
    Reads in a .smiles file and constructs :class:`Molecule` instances from it,
    all linked to a single new :class:`MoleculeSet` instance.

    Every line is decoded as UTF-8 and split at the first run of whitespace
    into a SMILES pattern and an optional name (empty string if absent).
    Lines starting with ``#`` and lines that are empty or contain only
    whitespace are skipped.

    :param file: A file handle to a .smiles file, yielding ``bytes`` lines.
    :returns: The newly created :class:`MoleculeSet` instance. The created
        :class:`Molecule` instances are available as its ``.molecules``
        property.
    """
    session = get_session()
    nof_mol = 0
    molset = MoleculeSet()
    for raw_line in file:
        # Rudimentary SMILES parsing
        line = raw_line.decode('utf-8')
        if line.startswith('#'):
            continue
        line_contents = line.split(None, 1)
        if not line_contents:
            # Blank / whitespace-only line (e.g. a trailing newline at the end
            # of the file): there is no pattern to store, so skip it instead
            # of failing with an IndexError below.
            continue
        # split(None, ...) already removes whitespace around the first token;
        # only the remainder (the name) can still carry trailing whitespace.
        pattern = line_contents[0]
        name = line_contents[1].strip() if len(line_contents) == 2 else ''
        molecule = Molecule(pattern=pattern, name=name, molset=molset)
        session.add(molecule)
        nof_mol += 1
    session.commit()
    logging.info(f"Added {nof_mol} Molecules to the database.")
    return molset
def matches_for_molecule_set(id: int):
    """
    A route that retrieves all matches associated with a MoleculeSet instance.

    Responds with a JSON object containing the ``molecule_set_id``, as well as
    an array of ``matches``. Each entry of ``matches`` has a ``molecule_id``,
    a ``molecule_name`` and a ``smarts_id``.

    :param id: The ID of the MoleculeSet instance.
    :return: A ``(body, status)`` tuple: the JSON described above with status
        200, or an ``error`` object with status 404 if no MoleculeSet with
        this ID exists.
    """
    db = get_session()
    molecule_set = db.query(MoleculeSet).get(id)
    if molecule_set is None:
        return {'error': 'Unknown molecule set.'}, 404

    set_id = molecule_set.id
    # Query selecting the IDs of all molecules in this set; it is handed to
    # in_() below rather than being iterated here.
    member_ids = db.query(Molecule.id).filter_by(molset_id=set_id)
    match_query = db.query(Match).filter(Match.molecule_id.in_(member_ids))

    match_entries = []
    for found in match_query:
        match_entries.append({
            'molecule_id': found.molecule_id,
            'molecule_name': found.molecule.name,
            'smarts_id': found.smarts_id,
        })

    payload = {'molecule_set_id': set_id, 'matches': match_entries}
    return payload, 200
def upload_molecule_set():
    """
    A route for uploading a set of molecules, given as a single SMILES file in
    the 'file' parameter.

    On success, redirects to the route :func:`matches_for_molecule_set` for the
    newly created MoleculeSet. Therefore, on success this returns the matches
    associated with the uploaded molecule set.

    On failure, responds with JSON containing a descriptive error string
    (key 'error'), which can be displayed directly in the frontend:

    * 400 if the request carries no file or the file fails validation,
    * 500 if an unexpected error occurs while matching or drawing.

    :return: A redirect response on success, otherwise a ``(body, status)``
        tuple as described above.
    """
    if 'file' not in request.files or not request.files['file'].filename:
        return {'error': 'Request seems to be missing a molecule file.'}, 400

    plain_file = request.files['file']
    try:
        file = _check_valid_file(plain_file)
    except ValueError as e:
        # The validation message is passed on to the client as-is.
        return {'error': str(e)}, 400

    mol_set = None
    try:
        mol_set = calculate_molecule_matches(file)
        draw_molecules_from_molset(mol_set)
        return redirect(url_for('molecules.matches_for_molecule_set',
                                id=mol_set.id))
    except Exception:
        # Log with the full traceback; the client only gets a generic message.
        logging.exception("Failed to process the uploaded molecule set")
        # mol_set is only bound here if calculate_molecule_matches returned,
        # i.e. the failure happened afterwards (drawing / building the
        # redirect). In that case remove the set that was already stored.
        if mol_set is not None:
            session = get_session()
            session.delete(mol_set)
            session.commit()
        return {'error': 'Unknown error occurred'}, 500
def from_db(min_similarity: float, max_similarity: float) -> dict:
    """
    Generates a dict representation of the directed graph data stored in the
    database: all stored SMARTS as nodes (key 'nodes') and those stored
    directed edges (key 'edges') whose ``spsim`` lies in the closed interval
    ``min_similarity <= spsim <= max_similarity``.

    :param min_similarity: The minimum similarity of the returned edges
        (inclusive).
    :param max_similarity: The maximum similarity of the returned edges
        (inclusive).
    :return: A dict of the available graph data as described.
    """
    session = get_session()
    all_smarts = session.query(SMARTS).all()
    # Two chained filters are ANDed together, just like two criteria passed
    # to a single filter() call.
    edges_in_range = (
        session.query(DirectedEdge)
        .filter(DirectedEdge.spsim >= min_similarity)
        .filter(DirectedEdge.spsim <= max_similarity)
        .all()
    )

    nodes = []
    for smart in all_smarts:
        nodes.append({
            'id': smart.id,
            'name': smart.name,
            'library': smart.library,
            'pattern': smart.pattern,
        })

    edges = []
    for edge in edges_in_range:
        edges.append({
            'id': edge.id,
            'source': edge.from_id,
            'target': edge.to_id,
            'mcssim': edge.mcssim,
            'spsim': edge.spsim,
        })

    return {'nodes': nodes, 'edges': edges}
def add_library_command(name, filename):
    """
    Add a SMARTS library (name & .smarts file) to the db.

    If SMARTS with the same library name are already stored, the user is
    asked for confirmation on stdin first; anything other than ``y``/``Y``
    aborts without inserting.

    :param name: The name of the library to add.
    :param filename: The filename of the .smarts file to read.
    :return: Whatever :func:`add_library` returns, or ``None`` when aborted.
    """
    session = get_session()
    existing_count = session.query(SMARTS).filter_by(library=name).count()
    if existing_count > 0:
        prompt = (
            f"There are already SMARTS with this library name ({name}) in the database.\n"
            f"Are you sure you want to add them? Enter Y to continue [y/N] "
        )
        confirmed = input(prompt).lower() == 'y'
        if not confirmed:
            click.echo(f"{name} was *not* inserted.")
            return
    return add_library(name, filename)
def draw_all_smarts_command():
    """
    Draws all SMARTS in the db to the serving directory. This is a required
    action before serving the application in production, for the frontend to
    work correctly.

    The viewer executable is read from the ``SMARTSCOMPARE_VIEWER_PATH``
    config key, the output directory from ``STATIC_SMARTSVIEW_PATH``; the
    output directory is created if it does not exist yet.

    :return: The result of ``draw_multiple_smarts``.
    :raises ValueError: If the configured viewer path is not a file.
    """
    import os

    smarts_to_draw = get_session().query(SMARTS).all()
    config = current_app.config
    viewer_path = config['SMARTSCOMPARE_VIEWER_PATH']
    output_path = config['STATIC_SMARTSVIEW_PATH']
    os.makedirs(output_path, exist_ok=True)

    if os.path.isfile(viewer_path):
        return draw_multiple_smarts(smarts_to_draw, viewer_path, output_path)
    raise ValueError(
        f"Viewer path {viewer_path} does not point to a file...!")
def deliver_molecule_image(id):
    """
    A route that delivers the image for a molecule, given the molecule's ID.

    The image is looked up as ``<molset_id>/<molecule_id>.svg`` below the
    directory configured as ``STATIC_MOL2SVG_MOLECULE_SETS_PATH``.

    :param id: The ID of the molecule.
    :return: A file response on success; a JSON ``error`` with status 404 if
        the molecule is unknown. A missing image file is left to
        ``send_from_directory`` to report.
    """
    molecule = get_session().query(Molecule).get(id)
    if molecule is None:
        return {'error': 'Molecule not found'}, 404

    # Both path components are passed through secure_filename before being
    # handed to the filesystem lookup.
    set_dirname = secure_filename(str(molecule.molset_id))
    image_name = secure_filename(f'{molecule.id}.svg')
    sets_root = current_app.config['STATIC_MOL2SVG_MOLECULE_SETS_PATH']
    image_dir = os.path.join(sets_root, set_dirname)
    return send_from_directory(image_dir, image_name)
def add_library(name: str, filename: str) -> None:
    """
    Add a SMARTS library (name and a .smarts file) to the db, by adding
    corresponding SMARTS objects to the database.

    Each line is expected to hold a SMARTS pattern, followed by whitespace
    and a label; the label becomes the name of the created SMARTS object.
    Lines starting with ``#`` are skipped silently. Every other line that
    does not have this shape (including blank lines) is not stored, and its
    1-based line number is reported at the end.

    :param name: The name of the library to add. Will be stored on the created
        SMARTS instances.
    :param filename: The filename of the .smarts file to create and store
        SMARTS from.
    """
    import re

    # <pattern without whitespace> <whitespace> <label up to end of line>
    line_format = re.compile(r'(^[^\s]+)\s+(.+)$')

    session = get_session()
    ignored_lines = []
    nof_added_smarts = 0
    with open(filename, 'r') as stream:
        # Count from 1 so reported numbers match what an editor shows.
        for line_number, raw_line in enumerate(stream, start=1):
            text = raw_line.strip()
            if text.startswith('#'):
                continue
            parsed = line_format.search(text)
            if parsed is None:
                ignored_lines.append(line_number)
                continue
            smarts_pattern, smarts_name = parsed.groups()
            session.add(
                SMARTS(name=smarts_name, pattern=smarts_pattern, library=name))
            nof_added_smarts += 1
    session.commit()

    click.echo(
        f"Added {nof_added_smarts} SMARTS to the database as library {name}.")
    if ignored_lines:
        click.echo("Ignored lines: " + ", ".join(map(str, ignored_lines)))
def draw_all_subsets_command():
    """
    Draws all DirectedEdges in the db to the serving directory. This is a
    required action before serving the application in production, for the
    frontend to work correctly.

    The viewer executable is the ``SMARTSCOMPARE_VIEWER_PATH`` config value
    joined onto the application root path; the output directory is read from
    ``STATIC_SMARTSVIEW_SUBSETS_PATH`` and created if it does not exist yet.

    :return: The result of ``draw_multiple_smarts_subset_relations``.
    :raises ValueError: If the resolved viewer path is not a file.
    """
    import os

    # Load both endpoint SMARTS of every edge together with the edges.
    eager_endpoints = (
        subqueryload(DirectedEdge.from_smarts),
        subqueryload(DirectedEdge.to_smarts),
    )
    edges_to_draw = (
        get_session().query(DirectedEdge).options(*eager_endpoints).all()
    )

    viewer_path = os.path.join(
        current_app.root_path,
        current_app.config['SMARTSCOMPARE_VIEWER_PATH'])
    output_path = current_app.config['STATIC_SMARTSVIEW_SUBSETS_PATH']
    os.makedirs(output_path, exist_ok=True)

    if os.path.isfile(viewer_path):
        return draw_multiple_smarts_subset_relations(
            edges_to_draw, viewer_path, output_path)
    raise ValueError(
        f"Viewer path {viewer_path} does not point to a file...!")
def calculate_molecule_matches(
        uploaded_molecules_file: BinaryIO) -> MoleculeSet:
    """
    Calculate molecule matches of all SMARTS in the database given a molecule
    file, and store the Molecule and Match instances in the database.

    The uploaded file is first stored as a new MoleculeSet. The configured
    match tool (``MATCHTOOL_PATH``) is then run on temporary molecule and
    SMARTS files, and one Match is stored per (SMARTS, molecule) pair parsed
    from its output. If the database holds no SMARTS, the MoleculeSet is
    returned without any matches.

    If anything fails after the MoleculeSet was created, the set is deleted
    again and the original exception is re-raised. All file handles,
    including ``uploaded_molecules_file``, are closed before this function
    returns or raises.

    :param uploaded_molecules_file: An open file handle to a molecule file to
        match.
    :returns: The MoleculeSet created from the uploaded file.
    """
    import tempfile
    import sys
    from smartsexplore.util import run_process

    moleculefile, smartsfile, moleculematchfile = None, None, None
    mol_set = None
    try:
        session = get_session()
        mol_set = create_molecules_from_smiles_file(uploaded_molecules_file)
        moleculefile, _ = molecules_to_temporary_smiles_file(mol_set.molecules)
        # Write all SMARTS patterns in the database to a temporary file
        try:
            smartsfile = write_smarts_to_tempfile()
        except NoSMARTSException:
            # Nothing to match against: keep the molecule set, store no matches.
            session.commit()
            return mol_set

        # Run moleculematch on the temporary SMARTS file, and write the
        # stdout to a new temporary result output file.
        moleculematchfile = tempfile.NamedTemporaryFile(mode='w+')
        match_cmd = [
            current_app.config['MATCHTOOL_PATH'],
            '-i', '2',
            '-m', moleculefile.name,
            '-s', smartsfile.name
        ]
        run_process(match_cmd, stdout=moleculematchfile, stderr=sys.stderr,
                    reraise_exceptions=True)
        moleculematchfile.seek(0)  # must rewind before parsing

        # Parse the moleculematch output
        parse_iterator = parse_moleculematch(moleculematchfile)

        # --- Code to store results in the database starts here ---
        for (smartsid, moleculeid) in parse_iterator:
            mmol = session.query(Molecule).get(moleculeid)
            msmarts = session.query(SMARTS).get(smartsid)
            newmatch = Match(molecule=mmol, smarts=msmarts)
            session.add(newmatch)

        # Commit the session
        session.commit()
        return mol_set
    except Exception:
        if mol_set is not None:
            # Clean up the molset if an exception occurred. Roll back first:
            # if the failure came from a flush/commit, the session refuses
            # further work until rollback() is called, and the cleanup would
            # otherwise raise and hide the original error.
            session = get_session()
            session.rollback()
            session.delete(mol_set)
            session.commit()
        # Bare raise keeps the original traceback untouched.
        raise
    finally:
        # close all open file handles
        if uploaded_molecules_file:
            uploaded_molecules_file.close()
        if moleculefile:
            moleculefile.close()
        if smartsfile:
            smartsfile.close()
        if moleculematchfile:
            moleculematchfile.close()
def calculate_edges(mode):
    """
    Calculate and add edges between all SMARTS in the database.

    Currently implements modes 'Similarity' and 'SubsetOfFirst'.
    'SubsetOfSecond' is redundant, and 'Identical' is currently just not
    implemented.

    When 'Similarity' mode is chosen, 0.1 is picked as a fixed similarity value
    lower bound; otherwise a too large number of edges for our purposes would
    (generally) be generated.

    Edges that are already stored (as reported by ``_get_existing_edges``) are
    skipped. If the database holds no SMARTS, a warning is logged and nothing
    is calculated.

    :param mode: The SMARTScompare mode to run, 'Similarity' (stored as
        UndirectedEdge) or 'SubsetOfFirst' (stored as DirectedEdge).
    :raises ValueError: If ``mode`` is not an implemented mode.
    """
    # Check validity of chosen mode first, so a bad mode fails before any
    # import, database access or temporary file is touched.
    implemented_modes = ('Similarity', 'SubsetOfFirst')
    if mode not in implemented_modes:
        raise ValueError(
            f"{mode} is not an implemented mode. Implemented modes are: "
            f"{', '.join(implemented_modes)}")

    import os
    import sys
    import tempfile
    from smartsexplore.util import run_process

    # Get a DB session, retrieve all SMARTS patterns, and write them into file
    session = get_session()
    try:
        smartsfile = write_smarts_to_tempfile()
    except NoSMARTSException:
        logging.warning(
            "No SMARTS in the database! Exiting the edge calculation process..."
        )
        # Actually exit: without a SMARTS file there is nothing to compare,
        # and smartsfile is unbound from here on.
        return

    # Get mode ID
    mode_map = {
        'Identical': 1,
        'SubsetOfFirst': 2,
        'SubsetOfSecond': 3,
        'Similarity': 4
    }
    mode_id = mode_map[mode]

    # Use half of the available CPUs, but at least one; os.cpu_count() may
    # return None when the count cannot be determined.
    nof_threads = max(1, (os.cpu_count() or 1) // 2)

    smartscomparefile = None
    try:
        # Run SMARTScompare on the temporary SMARTS file, and write the
        # stdout to a new temporary result output file.
        smartscomparefile = tempfile.NamedTemporaryFile(mode='w+')
        compare_cmd = [
            current_app.config['SMARTSCOMPARE_PATH'],
            smartsfile.name,
            '-M', '-1',
            # discard edges with <= 0.1 similarity when using (undirected) mode "Similarity"
            *(['-t', '0.1'] if mode == 'Similarity' else []),
            '-p', str(nof_threads),
            '-d', '|',
            '-D', '`',
            '-m', str(mode_id)
        ]
        run_process(compare_cmd, stdout=smartscomparefile, stderr=sys.stderr)
        smartscomparefile.seek(0)  # must rewind before further usage

        # Parse the SMARTScompare output; the first item yielded is the mode
        # the result file was written in.
        parse_iterator = parse_smartscompare(smartscomparefile)
        resultfile_mode = next(parse_iterator)
        assert resultfile_mode == mode,\
            f"Mode of the SMARTScompare output, {resultfile_mode}, does not match specified mode, {mode}!"

        # --- Code to store results in the database starts here ---
        # Get the existing edges in the database and store them in memory, for efficient checks
        existing_edges = _get_existing_edges(mode, session)
        nof_added_edges = 0
        duplicate_edges = []

        # Define a function to check for duplicates
        def _check_for_duplicates(l, r):
            if (l, r) in existing_edges:
                duplicate_edges.append((l, r))
                return True
            return False

        # Different loops and logic based on mode
        if mode == 'Similarity':
            for (_line_no, lname, rname, mcssim, spsim) in parse_iterator:
                lsmarts = session.query(SMARTS).filter_by(id=int(lname)).first()
                rsmarts = session.query(SMARTS).filter_by(id=int(rname)).first()
                # Undirected edges are stored with the lower ID first, so the
                # same pair is recognised regardless of output order.
                losmarts, hismarts = (lsmarts, rsmarts) if lsmarts.id < rsmarts.id\
                    else (rsmarts, lsmarts)
                assert losmarts.id != hismarts.id, f' {lname} {lsmarts}\n== {rname} {rsmarts}'
                assert losmarts.id < hismarts.id
                if not _check_for_duplicates(losmarts.id, hismarts.id):
                    edge = UndirectedEdge(low_smarts=losmarts, high_smarts=hismarts,
                                          mcssim=mcssim, spsim=spsim)
                    existing_edges.add((losmarts.id, hismarts.id))
                    session.add(edge)
                    nof_added_edges += 1
        elif mode == 'SubsetOfFirst':
            for (_line_no, lname, rname, mcssim, spsim) in parse_iterator:
                lsmarts = session.query(SMARTS).filter_by(id=int(lname)).first()
                rsmarts = session.query(SMARTS).filter_by(id=int(rname)).first()
                # The stored edge runs from the right-hand SMARTS of the
                # output line to the left-hand one.
                fromsmarts, tosmarts = rsmarts, lsmarts
                if not _check_for_duplicates(fromsmarts.id, tosmarts.id):
                    edge = DirectedEdge(from_smarts=fromsmarts, to_smarts=tosmarts,
                                        mcssim=mcssim, spsim=spsim)
                    existing_edges.add((fromsmarts.id, tosmarts.id))
                    session.add(edge)
                    nof_added_edges += 1

        # Commit the session
        session.commit()
    finally:
        # Close all temporary files, also when the comparison or the
        # database work above failed.
        smartsfile.close()
        if smartscomparefile is not None:
            smartscomparefile.close()

    logging.info(
        f"Added {nof_added_edges} edges to the database "
        f"({len(duplicate_edges)} duplicates skipped).")