class Source(object):
    """Import sources and keep source bookkeeping current.

    Source information is maintained both in mess.db and in the molecule
    directories.

    Attributes:
        db (obj): A MessDB object
        log (obj): A Log('all') object
        source_dir (str): A path to the source directory
        id (int): The source_id in the mess.db source table
        name (str): A name for the source
        dirname (str): The name of the source subdirectory in the 'sources'
                       directory
        url (str): Url where the source can be downloaded
        url_template (str): A url template that can be used to go to the web
                            page for a particular molecule in a source catalog
        last_update (str): Date when source was last downloaded
    """

    def __init__(self):
        """Create the db and log handles; source attributes start as None."""
        self.db = MessDB()
        self.log = Log('all')
        self.source_dir = None
        self.id = self.name = self.dirname = None
        self.url = self.url_template = self.last_update = None

    @classmethod
    def get_sources(cls):
        """Return a dict mapping source basenames to absolute source paths.

        A directory below '../sources/' (relative to this file) counts as a
        source when it contains an ini file named after the directory.
        """
        sources_root = os.path.join(os.path.dirname(__file__), '../sources/')
        found = {}
        for current_dir, _, entries in os.walk(sources_root):
            dir_basename = os.path.basename(current_dir)
            for entry in entries:
                if not entry.endswith('.ini'):
                    continue
                # hidden and temporary files are never source descriptors
                if entry.startswith('.') or entry.startswith('~'):
                    continue
                if os.path.splitext(entry)[0] == dir_basename:
                    found[dir_basename] = os.path.abspath(current_dir)
        return found

    def files(self):
        """Return the importable file names found in the source directory.

        Hidden files, temporary files, and '.txt' files are skipped; the
        remaining names are kept when their extension is a pybel input
        format.
        """
        importable = []
        for entry in os.listdir(self.source_dir):
            if (entry.startswith('.') or entry.startswith('~')
                    or entry.endswith('.txt')):
                continue
            if entry.split('.')[-1] in pybel.informats:
                importable.append(entry)
        return importable

    def setup(self, source):
        """Register the source in mess.db and load its attributes.

        Exits the program when the source is not found among the known
        sources.

        Args:
            source: A path to a source directory or a source basename.
        """
        basename = os.path.basename(source.rstrip(os.sep))
        if basename.endswith('.ini'):
            basename = os.path.splitext(basename)[0]
        known_sources = self.get_sources()
        if basename not in known_sources:
            sys.exit(("All sources must reside in the 'sources' directory, "
                      'read sources/SOURCES_README.md for details.'))
        self.source_dir = known_sources[basename]
        ini_path = os.path.join(self.source_dir, '%s.ini' % basename)
        attributes = self.parse_ini(ini_path)

        # insert the row if it is new, then refresh its mutable columns
        changes_before = self.db.total_changes
        insert_query = ('INSERT OR IGNORE INTO source '
                        '(name, dirname, url, url_template, '
                        'citation, last_update) '
                        'VALUES (?, ?, null, null, null, null)')
        update_query = ('UPDATE source '
                        'SET url=?, url_template=?, citation=?, last_update=? '
                        'WHERE dirname=?;')
        self.db.execute(insert_query, (attributes['name'], basename))
        update_values = (attributes['url'],
                         attributes['url_template'],
                         attributes['citation'],
                         attributes['last_update'],
                         basename)
        self.db.execute(update_query, update_values)
        # more than one change means the insert happened, not just the update
        if self.db.total_changes - changes_before > 1:
            self.log.info('%s added to sources in database', basename)

        select_query = ('SELECT source_id, name, dirname, '
                        'url, url_template, last_update '
                        'FROM source WHERE dirname=?')
        cursor = self.db.execute(select_query, (basename,))
        row = cursor.fetchone()
        # mirror the stored row on this object
        (self.id, self.name, self.dirname,
         self.url, self.url_template, self.last_update) = (
             row.source_id, row.name, row.dirname,
             row.url, row.url_template, row.last_update)

    def parse_ini(self, ini):
        """Read a source ini file and return its options as a dict.

        'url', 'url_template', and 'citation' default to None. Exits the
        program when 'name' or 'last_update' is absent.
        """
        attributes = {'url': None, 'url_template': None, 'citation': None}
        config = cp.ConfigParser(dict_type=CaseInsensitiveDict)
        config.read(ini)
        for section in config.sections():
            for option in config.options(section):
                raw_value = config.get(section, option)
                attributes[option] = unicode_replace(raw_value)
        required_attributes = ('name', 'last_update')
        missing = [att for att in required_attributes
                   if att not in attributes]
        if missing:
            sys.exit('Source INI missing required attributes: %s.'
                     % ' and/or '.join(required_attributes))
        return attributes

    def update_molecule_source_query(self, inchikey, identifier):
        """Build the query that links a molecule to this source in mess.db.

        Args:
            inchikey: A molecule InChIKey.
            identifier: A source identifier (usually a catalog number).

        Returns:
            A (query, values) tuple.
        """
        values = (inchikey, self.id, identifier)
        query = ('INSERT OR IGNORE INTO molecule_source '
                 '(inchikey, source_id, identifier) '
                 'VALUES (?, ?, ?)')
        return (query, values)

    def update_source_tsv(self, inchikey, identifier):
        """Append this source to the molecule's sources.tsv if it is missing.

        Args:
            inchikey: A molecule InChIKey; its directory is looked up with
                      get_inchikey_dir.
            identifier: A source identifier (usually a catalog number).
        """
        molecule_dir = get_inchikey_dir(inchikey)
        name = self.name.encode('ascii', 'replace')
        dirname = self.dirname.encode('ascii', 'replace')
        identifier = identifier.encode('ascii', 'replace')
        tsv_path = os.path.join(molecule_dir, '%s.sources.tsv' % inchikey)
        with codecs.open(tsv_path, 'r', 'ascii') as tsv_in, \
                codecs.open(tsv_path, 'a', 'ascii') as tsv_out:
            reader = csv.reader(tsv_in, delimiter=b'\t')
            writer = csv.writer(tsv_out, delimiter=b'\t')
            # scan every existing row for this (dirname, identifier) pair;
            # rows too short to hold both columns cannot match
            already_recorded = False
            for row in reader:
                if (len(row) > 2 and row[1] == dirname
                        and row[2] == identifier):
                    already_recorded = True
            if already_recorded:
                return
            source_url = ''
            if self.url_template:
                # the template holds a bracketed 'pattern, replacement' pair
                # that is applied to the identifier with re.sub
                pieces = re.split(r"\[|\]", self.url_template)
                (match, replace) = re.split(r",\s?", pieces[1])
                source_url = pieces[0] + re.sub(match, replace, identifier)
                if len(pieces) > 2:
                    source_url += pieces[2]
            writer.writerow([name, dirname, identifier,
                             source_url.encode('ascii', 'replace')])
            self.log.inchikey = inchikey
            self.log.info('%s added to %s sources', name, inchikey)
            self.log.inchikey = None
class Check(AbstractTool):
    """This tool verifies that data in the database and the molecules directory
    is self-consistent.

    Warnings are emitted through self.log_console, which this class does not
    set itself (presumably provided by AbstractTool -- confirm).
    """

    def __init__(self):
        """Set description of tool."""
        self.description = 'Check integrity of db/mess.db and molecules dir'
        self.epilog = ''

    def subparse(self, subparser):
        """Set tool-specific argparse arguments."""
        pass  # no arguments

    def execute(self, args):
        """Run self checks.

        Collects the valid standard InChIKeys from the molecule table into
        self.db_inchikeys (warning about invalid ones), then runs the
        directory, database, and concordance checks and prints a summary.
        """
        self.db = MessDB()
        cur = self.db.execute('SELECT inchikey FROM molecule')
        self.db_inchikeys = set()
        # check that inchikeys are all valid
        for result in cur:
            if is_inchikey(result.inchikey, enforce_standard=True):
                self.db_inchikeys.add(result.inchikey)
            else:
                self.log_console.warning(('%s is not a valid standard '
                                          'InChiKey!'), result.inchikey)
        self.check_dir_structure()
        self.check_db_structure()
        self.check_db_dir_inchikey_concordance()
        self.summary()

    def _distinct_values(self, query, *columns):
        """Run query and return the set of values found in the named columns."""
        values = set()
        for row in self.db.execute(query).fetchall():
            for column in columns:
                values.add(getattr(row, column))
        return values

    def _warn_loose(self, message, loose):
        """Warn with the item count and list the items on stderr, if any.

        Items are formatted with '%s' because many of the checked keys are
        integer ids, which str.join rejects.
        """
        if loose:
            self.log_console.warning(message, len(loose))
            print('\n'.join('%s' % item for item in loose), file=sys.stderr)

    def check_db_dir_inchikey_concordance(self):
        """Check concordance between inchikey dirs and mess.db."""
        # get list of inchikeys from molecules/ dir
        inchis = glob.glob(os.path.join(os.path.dirname(__file__),
                                        '../../molecules/*/*/*/', '*.inchi'))
        dir_inchikeys = set(os.path.basename(path).split('.')[0]
                            for path in inchis)
        # compare inchikeys from db vs dir
        in_db_not_dir = self.db_inchikeys - dir_inchikeys
        self._warn_loose('%d InChIKeys in mess.db that are not in '
                         'molecules dir', in_db_not_dir)
        in_dir_not_db = dir_inchikeys - self.db_inchikeys
        # the count reported here must be that of the dir-only set
        self._warn_loose('%d InChIKeys in molecules dir that are not in '
                         'mess.db', in_dir_not_db)

    def check_dir_structure(self):
        """Check that the structure of the molecules dir is consistent.

        Expected layout: a one-character dir, containing two-letter dirs,
        containing dirs whose names complete a standard InChIKey.
        """
        warn = self.log_console.warning
        moldir = os.path.join(os.path.dirname(__file__), '../../molecules')
        for first in os.listdir(moldir):
            first_path = os.path.join(moldir, first)
            if not os.path.isdir(first_path):
                # README and hidden files are tolerated at the top level
                if 'README' not in first and not first.startswith('.'):
                    warn('Unexpected file in molecules dir: %s', first)
                continue
            if not len(first) == 1:
                warn('Unexpected dir in molecules dir: %s', first)
                continue
            for second in os.listdir(first_path):
                second_path = os.path.join(moldir, first, second)
                if not os.path.isdir(second_path):
                    warn('Unexpected file in molecules dir: %s/%s',
                         first, second)
                    continue
                if not (len(second) == 2 and second.isalpha()):
                    warn('Unexpected dir in molecules dir: %s/%s',
                         first, second)
                    continue
                for rest in os.listdir(second_path):
                    rest_path = os.path.join(moldir, first, second, rest)
                    if not os.path.isdir(rest_path):
                        warn('Unexpected file in molecules dir: %s/%s/%s',
                             first, second, rest)
                        continue
                    inchikey = first + second + rest
                    if not is_inchikey(inchikey, enforce_standard=True):
                        warn('Unexpected dir in molecules dir: %s/%s/%s',
                             first, second, rest)
                        continue
                    self.check_molecule_dir(inchikey, rest_path)

    def check_molecule_dir(self, inchikey, d):
        """Check that molecule directory has the proper file in it.

        Args:
            inchikey: The InChIKey the directory belongs to.
            d: Path to the molecule directory.
        """
        contents = os.listdir(d)
        expected = (('.inchi', '%s does not contain inchi'),
                    ('.log', '%s does not contain log file'),
                    ('.notes', '%s does not contain notes'),
                    ('.png', '%s does not contain png'))
        for suffix, message in expected:
            if inchikey + suffix not in contents:
                self.log_console.warning(message, inchikey)
        # sources files are named '<inchikey>.sources.tsv', like the other
        # per-molecule files
        sources_tsv = inchikey + '.sources.tsv'
        if sources_tsv not in contents:
            self.log_console.warning(('%s does not contain '
                                      'sources.tsv file'), inchikey)
        else:
            self.check_sources_tsv(os.path.join(d, sources_tsv))
        for entry in contents:
            # listdir yields bare names, so test the path inside d rather
            # than a path relative to the current working directory
            entry_path = os.path.join(d, entry)
            if os.path.isdir(entry_path):
                self.check_method_dir(entry_path)

    def check_sources_tsv(self, s):
        """Check that sources.tsv files are consistent with mess.db."""
        pass

    def check_method_dir(self, d):
        """Check that method dirs are consistent with mess.db."""
        pass

    def check_db_structure(self):
        """Check that DB structure/data is self-consistent.

        For each foreign-key style column, collects the referenced ids and
        the referencing ids and warns about referencing ids that have no
        matching row ("loose" keys).
        """
        # check inchikey foreign keys
        molecule_synonym_inchikeys = self._distinct_values(
            'SELECT DISTINCT inchikey FROM molecule_synonym', 'inchikey')
        molecule_source_inchikeys = self._distinct_values(
            'SELECT DISTINCT inchikey FROM molecule_source', 'inchikey')
        molecule_method_property_inchikeys = self._distinct_values(
            'SELECT DISTINCT inchikey FROM molecule_method_property',
            'inchikey')
        self._warn_loose(('%d loose InChIKeys in '
                          'molecule_synonym table'),
                         molecule_synonym_inchikeys - self.db_inchikeys)
        self._warn_loose(('%d loose InChIKeys in '
                          'molecule_source table'),
                         molecule_source_inchikeys - self.db_inchikeys)
        self._warn_loose(('%d loose InChIKeys in '
                          'molecule_method_property table'),
                         (molecule_method_property_inchikeys -
                          self.db_inchikeys))
        # check that sources in db exist in sources dir
        q = 'SELECT source_id, dirname FROM source'
        source_ids = set()
        source_path = os.path.join(os.path.dirname(__file__), '../../sources')
        for r in self.db.execute(q).fetchall():
            source_ids.add(r.source_id)
            if not os.path.isdir(os.path.join(source_path, r.dirname)):
                self.log_console.warning('%s not in sources directory',
                                         r.dirname)
        # check source foreign keys
        molecule_source_ids = self._distinct_values(
            'SELECT DISTINCT source_id FROM molecule_source', 'source_id')
        self._warn_loose(('%d loose source_ids in '
                          'molecule_source table'),
                         molecule_source_ids - source_ids)
        # check program foreign keys
        program_ids = self._distinct_values(
            'SELECT program_id FROM program', 'program_id')
        method_program_ids = self._distinct_values(
            'SELECT DISTINCT program_id FROM method', 'program_id')
        self._warn_loose(('%d loose program_ids in '
                          'method table'),
                         method_program_ids - program_ids)
        # check parameter foreign keys
        parameter_ids = self._distinct_values(
            'SELECT parameter_id FROM parameter', 'parameter_id')
        method_parameter_ids = self._distinct_values(
            'SELECT DISTINCT parameter_id FROM method_parameter',
            'parameter_id')
        method_tag_parameter_ids = self._distinct_values(
            'SELECT DISTINCT parameter_id FROM method_tag', 'parameter_id')
        self._warn_loose(('%d loose parameter_ids in '
                          'method_parameter table'),
                         method_parameter_ids - parameter_ids)
        self._warn_loose(('%d loose parameter_ids in '
                          'method_tag table'),
                         method_tag_parameter_ids - parameter_ids)
        # check property foreign keys
        property_ids = self._distinct_values(
            'SELECT property_id FROM property', 'property_id')
        mmp_property_ids = self._distinct_values(
            'SELECT DISTINCT property_id FROM molecule_method_property',
            'property_id')
        # these ids come from molecule_method_property, so name that table
        self._warn_loose(('%d loose property_ids in '
                          'molecule_method_property table'),
                         mmp_property_ids - property_ids)
        # collect method ids (no methods-dir existence check is done here)
        method_ids = self._distinct_values(
            'SELECT method_id, name FROM method', 'method_id')
        # check method foreign keys
        method_parameter_mids = self._distinct_values(
            'SELECT DISTINCT method_id FROM method_parameter', 'method_id')
        method_edge_mids = self._distinct_values(
            ('SELECT DISTINCT parent_method_id, child_method_id '
             'FROM method_edge'),
            'parent_method_id', 'child_method_id')
        self._warn_loose(('%d loose method_ids in '
                          'method_parameter table'),
                         method_parameter_mids - method_ids)
        self._warn_loose(('%d loose method_ids in '
                          'method_edge table'),
                         method_edge_mids - method_ids)
        # check edge foreign keys
        method_edge_ids = self._distinct_values(
            'SELECT method_edge_id FROM method_edge', 'method_edge_id')
        method_path_edge_ids = self._distinct_values(
            'SELECT DISTINCT method_edge_id FROM method_path_edge',
            'method_edge_id')
        self._warn_loose(('%d loose method_edge_ids in '
                          'method_path_edge table'),
                         method_path_edge_ids - method_edge_ids)
        # check path foreign keys
        method_path_ids = self._distinct_values(
            'SELECT method_path_id FROM method_path', 'method_path_id')
        method_path_edge_pids = self._distinct_values(
            'SELECT DISTINCT method_path_id FROM method_path_edge',
            'method_path_id')
        self._warn_loose(('%d loose method_path_ids in '
                          'method_path_edge table'),
                         method_path_edge_pids - method_path_ids)
        # check edge closure
        # check path completeness, connectedness, and length concordance

    def summary(self):
        """Print summary statistics about molecules in MESS.DB."""
        self.log_console.info('%d molecules in MESS.DB',
                              len(self.db_inchikeys))
def execute(self, args):
    """Match molecules by SMARTS pattern, fingerprint, or Spectrophore.

    Exactly one of args.smarts, args.fingerprint, and args.spectrophore must
    be set. Each line of args.inchikeys supplies an InChIKey as its first
    whitespace-separated field; blank lines are skipped. Result rows are
    written to stdout with csv.writer using args.delimiter.

    For fingerprints and Spectrophores a stored value is looked up in
    molecule_fingerprint first and calculated only when no row is found.
    When a target is loaded, a row is written only if the similarity to the
    target exceeds args.cutoff; otherwise the raw fingerprint is written.

    Args:
        args: The parsed argparse namespace for this tool.
    """
    if args.inchikeys.name == '<stdin>' and args.inchikeys.isatty():
        sys.exit('No input specified.')
    # parse args
    if not (args.smarts or args.fingerprint or args.spectrophore):
        sys.exit('No operations were selected, nothing to match.')
    if sum(bool(arg) for arg in (args.smarts, args.fingerprint,
                                 args.spectrophore)) > 1:
        sys.exit(('One thing at a time, please. The arguments --smarts, '
                  '--fingerprint, and --spectrophore are mutually '
                  'exclusive.'))
    if args.smarts and args.target:
        self.log_console.warning(('--target ignored, proceeding with '
                                  'SMARTS matching'))
    if args.spectrophore:
        if args.path is None:
            sys.exit(('Spectrophore calculation requires 3D geometry. '
                      'You must specify a 3D geometry with --path.'))
        else:
            path = MethodPath()
            path.set_path(args.path)
            method_dir = path.get_path_directory()
        sp_args = {'normalization': args.spectrophore_normalization,
                   'accuracy': args.spectrophore_accuracy,
                   'stereo': args.spectrophore_stereospecificity,
                   'resolution': args.spectrophore_resolution}
    # load target and target fingerprints
    target_mol = None
    target_fp = None
    target_sp = None
    if args.target:
        # a target that exists on disk is read as a file whose format is its
        # extension; anything else is treated as a SMILES string
        if os.path.exists(args.target):
            target_mol = next(pybel.readfile(args.target.split('.')[-1],
                                             args.target))
        else:
            target_mol = pybel.readstring('smi', args.target)
    if target_mol is not None:
        if args.fingerprint:
            target_fp = self.calculate_fingerprint(target_mol,
                                                   args.fingerprint)
        if args.spectrophore:
            target_sp = self.calculate_spectrophore(target_mol, sp_args)
    # match every input
    db = MessDB()
    inchi_query = 'SELECT inchi FROM molecule WHERE inchikey = ?'
    fp_query = ('SELECT fingerprint FROM molecule_fingerprint '
                'WHERE inchikey = ? AND name = ? '
                'AND settings = ? AND method_path_id = ?')
    writer = csv.writer(sys.stdout, delimiter=args.delimiter)
    for row in args.inchikeys:
        fields = row.split()
        if not fields:
            # blank line: there is no inchikey to match
            continue
        inchikey = fields[0]
        if args.smarts or args.fingerprint:
            inchi = db.execute(inchi_query, (inchikey,)).fetchone()[0]
            mol = pybel.readstring('inchi', 'InChI=%s' % inchi)
        if args.smarts:
            canonical = pybel.ob.OBOp.FindType(b"canonical")
            canonical.Do(mol.OBMol)
            for (smarts_obj,
                 smarts_str) in self.smarts_generator(args.smarts):
                matches = list(smarts_obj.findall(mol))
                if matches:
                    writer.writerow([inchikey, smarts_str] + matches)
        if args.fingerprint:
            try:
                fp = db.execute(fp_query, (inchikey,
                                           args.fingerprint,
                                           '',
                                           '')).fetchone()[0]
            except TypeError:
                # fetchone() returned None: nothing stored, so calculate
                fp = self.calculate_fingerprint(mol, args.fingerprint)
            if target_fp is not None:
                similarity = self.calculate_similarity(target_fp, fp,
                                                       'tanimoto')
                if similarity > args.cutoff:
                    writer.writerow([inchikey, args.fingerprint,
                                     args.target, similarity])
            else:
                writer.writerow([inchikey, args.fingerprint] + fp)
        if args.spectrophore:
            try:
                sp = db.execute(fp_query, (inchikey,
                                           'Spectrophore',
                                           json.dumps(sp_args,
                                                      sort_keys=True),
                                           args.path)).fetchone()[0]
            except TypeError:
                # fetchone() returned None: calculate from the xyz file in
                # the molecule's method directory
                xyz_file = os.path.join(get_inchikey_dir(inchikey),
                                        method_dir,
                                        '%s.xyz' % inchikey)
                mol = next(pybel.readfile('xyz', xyz_file))
                sp = Match.calculate_spectrophore(mol, sp_args)
            if target_sp is not None:
                try:
                    similarity = self.calculate_similarity(target_sp, sp,
                                                           'cos')
                except ValueError:
                    similarity = 0
                if similarity > args.cutoff:
                    writer.writerow([inchikey, 'Spectrophore',
                                     args.target, similarity])
            else:
                writer.writerow([inchikey, 'Spectrophore'] + sp)
class AbstractMethod(object):
    """All methods must inherit from this class.

    Subclasses must define description, geop, prog_name, prog_version, and
    prog_url, and must implement check_dependencies; __init__ exits or
    raises otherwise.

    Attributes:
        db (obj): A MessDB object
        method_name (str): The name of the method
        description (str): Description of method
        geop (bool): Whether the method generates a new geometry
        prog_name (str): Program name
        prog_version (str): Program version
        prog_url (str): Program url
        parameters (dict): Parameters that affect program execution
    """
    parameters = dict()
    shortdesc = None
    method_citation = None
    prog_citation = None
    _inchikey = None
    _path_id = None
    _parent_path_id = None
    _method_dir = None
    _parent_method_dir = None

    def __init__(self):
        """Set up db, check for attributes, dependencies, and setup."""
        self.db = MessDB()
        self.path = MethodPath()
        self.log_console = Log('console')
        self.log_all = Log('all')
        self.method_name = self.get_method_name()
        # geop is the flag indicating the method results in new xyz
        # coordinates
        required = ('parameters', 'description', 'geop',
                    'prog_name', 'prog_version', 'prog_url')
        try:
            for attribute in required:
                getattr(self, attribute)
        except AttributeError as err:
            print(''.join([str(err), '\n']), file=sys.stderr)
            sys.exit(('Each method class needs to define description, geop, '
                      'prog_name, prog_version, prog_url, '
                      'parameters as attributes.'))
        self.check_dependencies()

    def __hash__(self):
        """Hash based on method name and parameters.

        NOTE: this returns a string, not an int, so the builtin hash() cannot
        be used on instances; use the `hash` property instead.

        Returns:
            A hex string of the sha1 hash of self.method_name plus
            JSON-serialized self.parameters. Keys and values are lowercased
            strings; keys are sorted.
        """
        normalized = dict((str(k).lower(), str(v).lower())
                          for k, v in self.parameters.items())
        payload = self.method_name + json.dumps(normalized, sort_keys=True)
        # sha1 needs bytes; encoding text explicitly gives the same digest
        # whether the payload arrives as a byte or a text string
        if not isinstance(payload, bytes):
            payload = payload.encode('utf-8')
        return hashlib.sha1(payload).hexdigest()

    @property
    def hash(self):
        """Get hash."""
        return self.__hash__()

    @property
    def method_id(self):
        """Get the object's method_id by looking up its hash in mess.db."""
        query = ('SELECT method_id FROM method '
                 'WHERE hash = ?;')
        row = self.db.execute(query, (self.hash,)).fetchone()
        return row.method_id

    @property
    def path_id(self):
        """Get the path id of the method."""
        if not self.path.get_method_id() == self.method_id:
            self._setup_path()
        return self._path_id

    @property
    def method_dir(self):
        """Get the directory name of the method."""
        if not self.path.get_method_id() == self.method_id:
            self._setup_path()
        return self._method_dir

    @property
    def parent_method_dir(self):
        """Get the parent directory name of the method."""
        if not self.path.get_method_id() == self.method_id:
            self._setup_path()
        return self._parent_method_dir

    @property
    def inchikey(self):
        """Get inchikey."""
        return self._inchikey

    @inchikey.setter
    def inchikey(self, inchikey):
        """Set inchikey, and update inchikey of logger.

        Raises:
            RuntimeError: If inchikey is not None and fails is_inchikey.
        """
        if inchikey is not None and not is_inchikey(inchikey):
            raise RuntimeError('invalid inchikey: %s' % inchikey)
        self._inchikey = inchikey
        self.log_all.inchikey = inchikey

    @classmethod
    def get_method_name(cls):
        """Return the name of the method, derived from the subclass name."""
        return cls.__name__.replace('_', '').lower()

    def _setup_path(self):
        """Setup path given current method id and parent path."""
        self.path.setup_path(self.method_id, self._parent_path_id)
        self._path_id = self.path.get_path_id()
        self._method_dir = self.path.get_path_directory()
        self._parent_method_dir = self.path.get_parent_path_directory()

    def _insert_method(self):
        """Insert the method row, linked to its program row and hash."""
        total_changes = self.db.total_changes
        query = ('INSERT OR IGNORE INTO method '
                 '(program_id, geop, name, shortdesc, citation, hash) '
                 'SELECT program.program_id, ?, ?, ?, ?, ? '
                 'FROM program '
                 'WHERE program.name=? AND program.version=?')
        self.db.execute(query, (self.geop, self.method_name,
                                self.shortdesc, self.method_citation,
                                self.hash, self.prog_name,
                                self.prog_version))
        if self.db.total_changes - total_changes > 0:
            self.log_all.info('new %s method added to MESS.DB',
                              self.method_name)

    def _insert_program(self):
        """Adds row to program table in mess.db."""
        total_changes = self.db.total_changes
        query = ('INSERT OR IGNORE INTO program '
                 '(name, version, url, citation) '
                 'VALUES (?, ?, ?, ?)')
        self.db.execute(query, (self.prog_name, self.prog_version,
                                self.prog_url, self.prog_citation))
        if self.db.total_changes - total_changes > 0:
            self.log_all.info('program %s %s added to MESS.DB',
                              self.prog_name, self.prog_version)

    def _insert_parameters(self):
        """Import the parameters dict to mess.db.

        Each parameter name is inserted into the parameter table if absent,
        then linked to this method with its setting in method_parameter.
        """
        added_parameters = 0
        for name, setting in self.parameters.items():
            query = ('INSERT OR IGNORE INTO parameter (name) VALUES (?)')
            self.db.execute(query, (name, ))
            total_changes = self.db.total_changes
            # select from parameter only: joining the unrelated program
            # table would yield one candidate row per program
            query = ('INSERT OR IGNORE INTO method_parameter '
                     '(method_id, parameter_id, setting) '
                     'SELECT ?, parameter.parameter_id, ? '
                     'FROM parameter '
                     'WHERE parameter.name=?')
            self.db.execute(query, (self.method_id, setting, name))
            added_parameters += (self.db.total_changes - total_changes)
        if added_parameters > 0:
            self.log_all.info('%i method parameters added to MESS.DB',
                              added_parameters)

    def get_insert_property_query(self, inchikey, name, description,
                                  format_, value, units=''):
        """Returns query to insert property value to mess.db.

        The method path id stored with the property is taken from
        self.path_id.

        Args:
            inchikey: The inchikey of a molecule in MESS.DB.
            name: The property name.
            description: A description of the property.
            format_: A description of the format the property is in.
            value: The calculated property.
            units: Units for the property value.

        Returns:
            A (query, values) tuple.
        """
        query = ('INSERT OR IGNORE INTO molecule_method_property_denorm '
                 'VALUES (?, ?, ?, ?, ?, ?, ?);')
        return (query, (inchikey, self.path_id, name, description,
                        format_, units, value))

    def get_insert_moldata_queries(self, inchikey, mol, description='',
                                   units=''):
        """Yields queries to insert molecule data values to mess.db.

        One property query is produced per item in mol.data; the format is
        the type name of the value.
        """
        for name, value in mol.data.items():
            yield self.get_insert_property_query(inchikey, name, description,
                                                 type(value).__name__, value,
                                                 units)

    def get_timing_query(self, inchikey, start):
        """Get a query to insert execution time property into db.

        Args:
            inchikey: The inchikey of a molecule in MESS.DB.
            start: The start time, as returned by time.time().
        """
        return self.get_insert_property_query(inchikey, 'runtime',
                                              'execution time',
                                              type(start).__name__,
                                              time.time() - start, 's')

    def set_parent_path(self, parent_path):
        """Set the parent path (e.g., path to method containing input
        geometry.) Values that are not greater than 0 are ignored."""
        if parent_path > 0:
            self._parent_path_id = parent_path

    def has_parent_path(self, inchikey):
        """Returns True if the molecule has at least one property recorded
        for the parent path, False otherwise."""
        query = ('SELECT inchikey FROM molecule_method_property WHERE '
                 'inchikey = ? AND method_path_id = ?')
        try:
            self.db.execute(query, (inchikey,
                                    self._parent_path_id)).fetchone()[0]
            return True
        except TypeError:
            # fetchone() returned None: no matching row
            return False

    def check_dependencies(self):
        """If check_dependencies is not implemented, raise error."""
        raise NotImplementedError(("every method needs a 'check_dependencies' "
                                   'method'))

    def check(self):
        """If check is not implemented, raise error."""
        # the check method should be called before a calculation (so
        # calculations are not repeated) and after (to verify success)
        raise NotImplementedError("every method needs a 'check' method")

    def map(self, inchikey, inchikey_dir):
        """Generally, maps molecule to calculation via method, emits
        query/value pairs.
        """
        raise NotImplementedError(("every method needs a 'map' method"))

    def reduce(self, query, values):
        """Run queries/values on the db and log how many rows were added."""
        total_changes = self.db.total_changes
        if query or values[0]:
            self.db.executemany(query, values)
            self.log_all.info('%i properties added to MESS.DB',
                              self.db.total_changes - total_changes)

    def setup(self):
        """Set up method: insert program, method, and parameters, in order."""
        self._insert_program()
        self._insert_method()
        self._insert_parameters()
def execute(self, args):
    """Run select query, output table.

    Runs a custom SQL query (args.query, either a path to a sql file or a
    literal sql string), a property filter query, or a plain selection of
    all inchikeys. Result rows are written to stdout with csv.writer and can
    be narrowed by the inchikeys given on input, a regex, a --part/--of
    subset, and SMARTS patterns.

    Args:
        args: The parsed argparse namespace for this tool.
    """
    if args.inchikeys.name == '<stdin>' and args.inchikeys.isatty():
        filter_from = None
    else:
        try:
            filter_from = set(row.split()[0].strip()
                              for row in args.inchikeys)
        except IndexError:
            # a blank input line ends the run without output
            return
    if args.query and (args.property_name or
                       args.property_operator or
                       args.property_value):
        sys.exit(('Custom SQL queries are mutually exclusive with '
                  'property filtering.'))
    if (args.part or args.of) and not (args.part and args.of):
        sys.exit(('If you specify a --part n, you must also specify --of '
                  'N (e.g. something like --part 1 --of 5).'))
    subset_prefixes = None
    if args.part and args.of:
        if args.part > args.of:
            sys.exit('--part must be smaller than --of.')
        if args.part < 1:
            sys.exit('--part must be >=1.')
        alpha = string.ascii_uppercase
        alpha3 = [''.join([a, b, c])
                  for a in alpha for b in alpha for c in alpha]  # AAA to ZZZ
        if args.of > len(alpha3):
            sys.exit(('MESS.DB does not support subsetting into more than '
                      '%i parts.' % len(alpha3)))
        # deal the three-letter prefixes round-robin into args.of subsets
        # and keep the one selected by args.part (1-based)
        subsets = [alpha3[i::args.of] for i in range(args.of)]
        subset_prefixes = tuple(subsets[args.part - 1])
    db = MessDB()
    cur = db.cursor()
    if args.query:
        try:
            # read the file inside a context manager so it is closed even
            # when the sql turns out to be invalid
            with codecs.open(args.query, encoding='utf-8') as query_file:
                sql = query_file.read()
            cur.execute(sql)
        except sqlite3.OperationalError:
            sys.exit("'%s' does not contain valid sql." % args.query)
        except IOError:
            # not a readable file: treat the argument itself as sql
            try:
                cur.execute(args.query)
            except sqlite3.OperationalError:
                sys.exit(("'%s' is neither valid sql nor a path "
                          'to a file containing valid sql.') % args.query)
    elif (args.property_name and args.property_operator and
          args.property_value is not None):
        query, values = self.property_query(args.property_name,
                                            args.property_operator,
                                            args.property_value,
                                            args.path)
        cur.execute(query, values)
    else:
        cur.execute('SELECT inchikey FROM molecule')
    # check that sql returns inchikey in first column
    if not cur.description[0][0].lower() == 'inchikey':
        sys.exit('Query must return inchikey in first column.')
    # print table
    writer = csv.writer(sys.stdout, delimiter=args.delimiter)
    if args.headers:
        writer.writerow(list(h[0] for h in cur.description))
    inchi_query = 'SELECT inchi FROM molecule WHERE inchikey = ?'
    for result in cur:
        if filter_from is not None and result[0] not in filter_from:
            continue
        if args.regex_subset and not re.match(args.regex_subset, result[0],
                                              re.IGNORECASE):
            continue
        if subset_prefixes is not None:
            if not result[0].startswith(subset_prefixes):
                continue
        if args.smarts:
            matches = 0
            inchi = db.execute(inchi_query, (result[0],)).fetchone()[0]
            mol = pybel.readstring('inchi', 'InChI=%s' % inchi)
            for (smarts_obj,
                 smarts_str) in Match.smarts_generator(args.smarts):
                matches += len(smarts_obj.findall(mol))
            if not matches:
                continue
        writer.writerow(list(xstr(v).decode('utf-8') for v in result))
    db.close()  # must be closed manually to prevent db locking during pipe
class Annotate(AbstractTool):
    """This tool annotates molecules with synonyms (common names, CAS, etc)
    and various fingerprints.
    """

    def __init__(self):
        """Set description of tool."""
        self.description = 'Annotate molecules with synonyms and fingerprints'
        self.epilog = ''

    def subparse(self, subparser):
        """Set tool-specific argparse arguments.

        Args:
            subparser: The argparse parser to which this tool's arguments
                are added.
        """
        subparser.add_argument('inchikeys', nargs='?',
                               type=argparse.FileType('r'), default=sys.stdin,
                               help=('a list of inchikeys (default: STDIN)'))
        subparser.add_argument('-c', '--cir', action='store_true',
                               help=('get IUPAC names and other synonyms from '
                                     'the Chemical Information Resolver web '
                                     'service'))
        subparser.add_argument('-f', '--fingerprint', type=str,
                               choices=[b'FP2', b'FP3', b'FP4',
                                        b'MACCS', b'MNA', b'MPD'],
                               help=('calculate fingerprint'))
        subparser.add_argument('-s', '--spectrophore', action='store_true',
                               help=('calculate Spectrophore '
                                     'descriptor/fingerprint; '
                                     'requires 3D geometry (i.e., you must '
                                     'set a path to a method that has '
                                     'generated xyz coordinates)'))
        subparser.add_argument('-p', '--path', type=int, default=None,
                               help=('specify a path id, only used for '
                                     'Spectrophore'))
        sp_group = subparser.add_argument_group(('Spectrophore optional '
                                                 'arguments'))
        sp_group.add_argument('-sn', '--spectrophore-normalization', type=str,
                              default='No',
                              choices=[b'No', b'ZeroMean', b'UnitStd',
                                       b'ZeroMeanAndUnitStd'],
                              help=('perform normalization of Spectrophore'))
        sp_group.add_argument('-sa', '--spectrophore-accuracy', type=int,
                              default=20,
                              choices=[1, 2, 5, 10, 15, 20, 30, 36, 45, 60],
                              help=('Spectrophore accuracy expressed as '
                                    'angular stepsize; lower is more accurate '
                                    'but slower'))
        sp_group.add_argument('-ss', '--spectrophore-stereospecificity',
                              type=str, default='No',
                              choices=[b'No', b'Unique', b'Mirror', b'All'],
                              help=('cage type in terms of the underlying '
                                    'pointgroup: P1 or P-1'))
        sp_group.add_argument('-sr', '--spectrophore-resolution', type=float,
                              metavar='FLOAT', default=3.0,
                              help=('required Spectrophore resolution in '
                                    'Angstroms'))

    def execute(self, args):
        """Annotate molecules with synonyms and/or fingerprints.

        For every inchikey read from args.inchikeys, the requested
        annotations (CIR names, a fingerprint, a Spectrophore) are computed
        and stored in mess.db. A fingerprint that is already stored is never
        overwritten; a mismatch with the stored value is only reported.

        Args:
            args: The parsed argparse namespace produced from the arguments
                declared in subparse.
        """
        if args.inchikeys.name == '<stdin>' and args.inchikeys.isatty():
            sys.exit('No input specified.')
        if not (args.cir or args.fingerprint or args.spectrophore):
            sys.exit('You did not request any annotations.')
        if args.spectrophore:
            if args.path is None:
                sys.exit(('Spectrophore calculation requires 3D geometry. '
                          'You must specify a 3D geometry with --path.'))
            path = MethodPath()
            path.set_path(args.path)
            method_dir = path.get_path_directory()
            sp_args = {'normalization': args.spectrophore_normalization,
                       'accuracy': args.spectrophore_accuracy,
                       'stereo': args.spectrophore_stereospecificity,
                       'resolution': args.spectrophore_resolution}
        self.db = MessDB()
        inchi_select_query = 'SELECT inchi FROM molecule WHERE inchikey = ?'
        fp_select_query = ('SELECT fingerprint FROM molecule_fingerprint '
                           'WHERE inchikey = ? '
                           'AND name = ? '
                           'AND settings = ? '
                           'AND method_path_id = ?')
        fp_insert_query = ('INSERT INTO molecule_fingerprint '
                           '(inchikey, name, settings, '
                           'fingerprint, method_path_id) '
                           'VALUES (?, ?, ?, ?, ?)')
        for row in args.inchikeys:
            fields = row.split()
            if not fields:
                continue  # skip blank lines instead of raising IndexError
            self.inchikey = fields[0]
            if args.cir:
                self.update_iupac(self.inchikey)
                self.update_synonyms(self.inchikey)
            if args.fingerprint:
                inchi = self.db.execute(inchi_select_query,
                                        (self.inchikey,)).fetchone()[0]
                mol = pybel.readstring('inchi', 'InChI=%s' % inchi)
                canonical = pybel.ob.OBOp.FindType(b'canonical')
                canonical.Do(mol.OBMol)
                fp = Match.calculate_fingerprint(mol, args.fingerprint)
                fp_row = self.db.execute(fp_select_query,
                                         (self.inchikey, args.fingerprint,
                                          '', '')).fetchone()
                if fp_row is None:
                    # nothing stored yet for this molecule and fingerprint
                    self.db.execute(fp_insert_query,
                                    (self.inchikey, args.fingerprint,
                                     '', str(fp), ''))
                    self.log_all.info('%s fingerprint for %s added to db',
                                      args.fingerprint, self.inchikey)
                elif str(fp) != fp_row[0]:
                    self.log_console.warning(('new %s fingerprint '
                                              'for %s did not match '
                                              'fingerprint in db, '
                                              'db not updated'),
                                             args.fingerprint, self.inchikey)
            if args.spectrophore:
                xyz_file = os.path.join(get_inchikey_dir(self.inchikey),
                                        method_dir,
                                        '%s.xyz' % self.inchikey)
                mol = next(pybel.readfile('xyz', xyz_file))
                sp = Match.calculate_spectrophore(mol, sp_args)
                # the sorted JSON of the settings is part of the lookup key
                json_sp_args = json.dumps(sp_args, sort_keys=True)
                sp_row = self.db.execute(fp_select_query,
                                         (self.inchikey, 'Spectrophore',
                                          json_sp_args,
                                          args.path)).fetchone()
                if sp_row is None:
                    self.db.execute(fp_insert_query,
                                    (self.inchikey, 'Spectrophore',
                                     json_sp_args, str(sp), args.path))
                    self.log_all.info(('Spectrophore fingerprint for %s '
                                       'with parameters %s and '
                                       'geometry from path %i '
                                       'added to db'),
                                      self.inchikey, json_sp_args, args.path)
                elif str(sp) != sp_row[0]:
                    self.log_console.warning(('new Spectrophore '
                                              'fingerprint for '
                                              '%s did not match '
                                              'fingerprint in db, '
                                              'db not updated'),
                                             self.inchikey)

    @staticmethod
    def _split_names(response):
        """Split a CIR response into stripped, non-empty names."""
        stripped = (line.strip() for line in response.split('\n'))
        return [name for name in stripped if name]

    def _add_synonyms(self, inchikey, names):
        """Insert names not yet stored as synonyms and log how many."""
        select_query = ('SELECT inchikey FROM molecule_synonym '
                        'WHERE inchikey = ? AND name = ?')
        insert_query = ('INSERT INTO molecule_synonym (inchikey, name) '
                        'VALUES (?, ?)')
        new_synonyms = 0
        for synonym in names:
            if self.db.execute(select_query,
                               (inchikey, synonym)).fetchone() is None:
                self.db.execute(insert_query, (inchikey, synonym))
                new_synonyms += 1
        if new_synonyms > 0:
            plural = 's' if new_synonyms > 1 else ''
            self.log_all.info('%i new synonym%s for %s added',
                              new_synonyms, plural, inchikey)

    def update_synonyms(self, inchikey):
        """Get synonyms from CIR and load them into mess.db.

        Args:
            inchikey: The InChIKey of the molecule to annotate.
        """
        synonyms = self.cir_request(inchikey, 'names')
        if not synonyms:
            return
        # empty lines (e.g. from a trailing newline) are dropped so that
        # no empty synonym is stored
        self._add_synonyms(inchikey, self._split_names(synonyms))

    def update_iupac(self, inchikey):
        """Get IUPAC name from CIR and load it into mess.db.

        If CIR returns several names, the longest one is stored as the
        IUPAC name and the others are stored as synonyms.

        Args:
            inchikey: The InChIKey of the molecule to annotate.
        """
        response = self.cir_request(inchikey, 'iupac_name')
        if not response:
            return
        iupacs = self._split_names(response)
        if not iupacs:
            return  # response held only whitespace; nothing to store
        # if multiple iupacs, take the longest (most specific) one
        iupac = max(iupacs, key=len)
        iupac_select_query = ('SELECT iupac FROM molecule '
                              'WHERE inchikey = ?')
        iupac_update_query = ('UPDATE molecule SET iupac = ? '
                              'WHERE inchikey = ?')
        db_iupac = self.db.execute(iupac_select_query,
                                   (inchikey, )).fetchone()[0]
        if db_iupac != iupac:
            self.db.execute(iupac_update_query, (iupac, inchikey))
            self.log_all.info('iupac name for %s updated', inchikey)
        # if multiple, add the others as synonyms (ignore longest iupac)
        others = [name for name in iupacs if name != iupac]
        if others:
            self._add_synonyms(inchikey, others)

    def cir_request(self, inchikey, representation):
        """Make request to CIR (Chemical Information Resolver).

        Args:
            inchikey: A valid InChIKey.
            representation: The representation desired from CIR.

        Returns:
            CIR's response, or None if there isn't one.
        """
        url = ('http://cactus.nci.nih.gov/chemical/structure/%s/%s'
               % (inchikey, representation))
        headers = {'User-Agent': 'MESS.DB'}
        request = urllib2.Request(url, None, headers)
        try:
            response = urllib2.urlopen(request)
            try:
                if response.getcode() == 200:
                    time.sleep(0.2)  # protect cactus from hammering
                    return response.read()
            finally:
                response.close()
        except urllib2.URLError as err:
            if hasattr(err, 'reason'):
                # reason is not always a string (it may be an exception
                # object), so convert it before lower-casing
                reason = str(err.reason).lower()
                self.log_console.info('%s %s %s in cir',
                                      inchikey, representation, reason)
        return None