Пример #1
0
class Source(object):
    """Import sources and keep their bookkeeping in mess.db and in the
    per-molecule directories up to date.

    Attributes:
        db (obj): A MessDB object
        log (obj): A Log('all') object
        source_dir (str): A path to the source directory
        id (int): The source_id in the mess.db source table
        name (str): A name for the source
        dirname (str): The name of the source subdirectory in the 'sources'
                       directory
        url (str): Url where the source can be downloaded
        url_template (str): A url template that can be used to go to the web
                            page for a particular molecule in a source catalog
        last_update (str): Date when source was last downloaded
    """
    def __init__(self):
        """Create the database and log handles.

        Takes no arguments. All source-specific attributes start out as None
        and are filled in by setup().
        """
        self.db = MessDB()
        self.log = Log('all')
        self.source_dir = self.id = self.name = self.dirname = None
        self.url = self.url_template = self.last_update = None

    @classmethod
    def get_sources(cls):
        """Returns a dictionary of source basenames mapped to source paths.

        A directory below '../sources/' (relative to this file) counts as a
        source when it contains a visible '<dirname>.ini' file.
        """
        search_root = os.path.join(os.path.dirname(__file__), '../sources/')
        found = {}
        for dirpath, _, names in os.walk(search_root):
            dir_basename = os.path.basename(dirpath)
            for name in names:
                if not name.endswith('.ini'):
                    continue
                if name.startswith('.') or name.startswith('~'):
                    continue  # hidden or temporary file
                if os.path.splitext(name)[0] == dir_basename:
                    found[dir_basename] = os.path.abspath(dirpath)
        return found

    def files(self):
        """Returns a list of importable files in the source directory.

        Hidden files, '~' files and '.txt' files are skipped; the remaining
        entries are kept when their extension is a key of pybel.informats.
        """
        importable = []
        for entry in os.listdir(self.source_dir):
            if (entry.startswith('.')
                    or entry.startswith('~')
                    or entry.endswith('.txt')):
                continue
            if entry.split('.')[-1] in pybel.informats:
                importable.append(entry)
        return importable

    def setup(self, source):
        """Register the source in mess.db and load its attributes.

        Exits the program when the source is not one of the directories
        reported by get_sources().

        Args:
            source: A path to a source directory or a source basename.
        """
        basename = os.path.basename(source.rstrip(os.sep))
        if basename.endswith('.ini'):
            basename = os.path.splitext(basename)[0]
        known_sources = self.get_sources()
        if basename not in known_sources:
            sys.exit(("All sources must reside in the 'sources' directory, "
                      'read sources/SOURCES_README.md for details.'))
        self.source_dir = known_sources[basename]
        ini_path = os.path.join(self.source_dir, '%s.ini' % basename)
        attributes = self.parse_ini(ini_path)
        # insert/update source in the database
        changes_before = self.db.total_changes
        self.db.execute(('INSERT OR IGNORE INTO source '
                         '(name, dirname, url, url_template, '
                         'citation, last_update) '
                         'VALUES (?, ?, null, null, null, null)'),
                        (attributes['name'], basename))
        self.db.execute(('UPDATE source '
                         'SET url=?, url_template=?, citation=?, last_update=? '
                         'WHERE dirname=?;'),
                        (attributes['url'],
                         attributes['url_template'],
                         attributes['citation'],
                         attributes['last_update'],
                         basename))
        # more than one change means the INSERT took effect in addition to
        # the UPDATE, i.e. the source row is new
        if self.db.total_changes - changes_before > 1:
            self.log.info('%s added to sources in database', basename)
        row = self.db.execute(('SELECT source_id, name, dirname, '
                               'url, url_template, last_update '
                               'FROM source WHERE dirname=?'),
                              (basename,)).fetchone()
        # set attributes
        self.id = row.source_id
        self.name = row.name
        self.dirname = row.dirname
        self.url = row.url
        self.url_template = row.url_template
        self.last_update = row.last_update

    def parse_ini(self, ini):
        """Parse source ini and return its options as a dict.

        Options from every section are merged into one dict; 'url',
        'url_template' and 'citation' default to None. Exits the program
        when 'name' or 'last_update' is missing.
        """
        attributes = dict.fromkeys(('url', 'url_template', 'citation'))
        parser = cp.ConfigParser(dict_type=CaseInsensitiveDict)
        parser.read(ini)
        for section in parser.sections():
            for option in parser.options(section):
                raw_value = parser.get(section, option)
                attributes[option] = unicode_replace(raw_value)
        required = ('name', 'last_update')
        if any(att not in attributes for att in required):
            sys.exit('Source INI missing required attributes: %s.'
                     % ' and/or '.join(required))
        return attributes

    def update_molecule_source_query(self, inchikey, identifier):
        """Build the query that links a molecule to this source in mess.db.

        Args:
            inchikey: A molecule InChIKey.
            identifier: A source identifier (usually a catalog number).

        Returns:
            A (query, values) tuple; nothing is executed here.
        """
        values = (inchikey, self.id, identifier)
        return ('INSERT OR IGNORE INTO molecule_source '
                '(inchikey, source_id, identifier) '
                'VALUES (?, ?, ?)', values)

    def update_source_tsv(self, inchikey, identifier):
        """Append this source to the molecule's sources.tsv file, unless a
        row with the same dirname and identifier is already there.

        Args:
            inchikey: A molecule InChIKey.
            identifier: A source identifier (usually a catalog number).

        """
        tsv_path = os.path.join(get_inchikey_dir(inchikey),
                                '%s.sources.tsv' % inchikey)
        name = self.name.encode('ascii', 'replace')
        dirname = self.dirname.encode('ascii', 'replace')
        identifier = identifier.encode('ascii', 'replace')
        with codecs.open(tsv_path, 'r', 'ascii') as tsv_in, \
                codecs.open(tsv_path, 'a', 'ascii') as tsv_out:
            reader = csv.reader(tsv_in, delimiter=b'\t')
            writer = csv.writer(tsv_out, delimiter=b'\t')
            # check if source has been recorded (rows with fewer than three
            # columns can never match)
            recorded = [row for row in reader
                        if len(row) > 2
                        and row[1] == dirname
                        and row[2] == identifier]
            if recorded:
                return
            source_url = ''
            if self.url_template:
                # template looks like 'prefix[match, replace]suffix'
                pieces = re.split(r"\[|\]", self.url_template)
                (match, replace) = re.split(r",\s?", pieces[1])
                source_url = pieces[0] + re.sub(match, replace, identifier)
                if len(pieces) > 2:
                    source_url += pieces[2]
            writer.writerow([name, dirname, identifier,
                             source_url.encode('ascii', 'replace')])
            self.log.inchikey = inchikey
            self.log.info('%s added to %s sources', name, inchikey)
            self.log.inchikey = None
Пример #2
0
class Check(AbstractTool):
    """This tool verifies that data in the database and the molecules directory
    is self-consistent.

    execute() sets two attributes that the individual checks rely on:
        db (obj): A MessDB object
        db_inchikeys (set): InChIKeys from the molecule table that passed
                            is_inchikey(..., enforce_standard=True)
    """

    def __init__(self):
        """Set description of tool."""
        self.description = 'Check integrity of db/mess.db and molecules dir'
        self.epilog = ''

    def subparse(self, subparser):
        """Set tool-specific argparse arguments."""
        pass  # no arguments

    def execute(self, args):
        """Run self checks.

        Args:
            args: Parsed command-line arguments (unused by this tool).
        """
        self.db = MessDB()
        cur = self.db.execute('SELECT inchikey FROM molecule')
        self.db_inchikeys = set()
        # check that inchikeys are all valid
        for result in cur:
            if is_inchikey(result.inchikey, enforce_standard=True):
                self.db_inchikeys.add(result.inchikey)
            else:
                self.log_console.warning(('%s is not a valid standard '
                                          'InChiKey!'), result.inchikey)
        self.check_dir_structure()
        self.check_db_structure()
        self.check_db_dir_inchikey_concordance()
        self.summary()

    def _distinct_values(self, query, *columns):
        """Run query and return the set of values in the named row columns."""
        values = set()
        for row in self.db.execute(query).fetchall():
            for column in columns:
                values.add(getattr(row, column))
        return values

    def _report_loose(self, loose, message):
        """Warn with the count of loose keys and list them on stderr.

        Does nothing when loose is empty. message must contain a single
        '%d' placeholder for the count.
        """
        if not loose:
            return
        self.log_console.warning(message, len(loose))
        # keys may be integers (ids) or strings (inchikeys); str.join only
        # accepts strings, so format every key first
        print('\n'.join('%s' % (key,) for key in loose), file=sys.stderr)

    def check_db_dir_inchikey_concordance(self):
        """Check concordance between inchikey dirs and mess.db."""
        # get list of inchikeys from molecules/ dir
        inchis = glob.glob(os.path.join(os.path.dirname(__file__),
                                        '../../molecules/*/*/*/', '*.inchi'))
        dir_inchikeys = set(os.path.basename(path).split('.')[0]
                            for path in inchis)
        # compare inchikeys from db vs dir
        self._report_loose(self.db_inchikeys - dir_inchikeys,
                           ('%d InChIKeys in mess.db that are not in '
                            'molecules dir'))
        self._report_loose(dir_inchikeys - self.db_inchikeys,
                           ('%d InChIKeys in molecules dir that are not in '
                            'mess.db'))

    def check_dir_structure(self):
        """Check that the structure of the molecules dir is consistent.

        The expected layout is molecules/<1 char>/<2 letters>/<rest>/, where
        the three directory names concatenated form a standard InChIKey.
        Anything else is reported; valid molecule dirs are handed to
        check_molecule_dir().
        """
        moldir = os.path.join(os.path.dirname(__file__), '../../molecules')
        for first in os.listdir(moldir):
            first_path = os.path.join(moldir, first)
            if not os.path.isdir(first_path):
                if 'README' not in first and not first.startswith('.'):
                    self.log_console.warning(('Unexpected file in '
                                              'molecules dir: %s'), first)
                continue
            if len(first) != 1:
                self.log_console.warning(('Unexpected dir in '
                                          'molecules dir: %s'), first)
                continue
            for second in os.listdir(first_path):
                second_path = os.path.join(first_path, second)
                if not os.path.isdir(second_path):
                    self.log_console.warning(('Unexpected file '
                                              'in molecules dir: '
                                              '%s/%s'), first, second)
                    continue
                if not (len(second) == 2 and second.isalpha()):
                    self.log_console.warning(('Unexpected dir '
                                              'in molecules dir: '
                                              '%s/%s'), first, second)
                    continue
                for third in os.listdir(second_path):
                    third_path = os.path.join(second_path, third)
                    if not os.path.isdir(third_path):
                        self.log_console.warning(('Unexpected file '
                                                  'in molecules dir: '
                                                  '%s/%s/%s'),
                                                 first, second, third)
                        continue
                    inchikey = first + second + third
                    if not is_inchikey(inchikey, enforce_standard=True):
                        self.log_console.warning(('Unexpected dir '
                                                  'in molecules dir: '
                                                  '%s/%s/%s'),
                                                 first, second, third)
                        continue
                    self.check_molecule_dir(inchikey, third_path)

    def check_molecule_dir(self, inchikey, d):
        """Check that molecule directory has the proper files in it.

        Args:
            inchikey: The InChIKey the directory belongs to.
            d: Path to the molecule directory.
        """
        entries = os.listdir(d)
        required = (('.inchi', '%s does not contain inchi'),
                    ('.log', '%s does not contain log file'),
                    ('.notes', '%s does not contain notes'),
                    ('.png', '%s does not contain png'))
        for suffix, message in required:
            if inchikey + suffix not in entries:
                self.log_console.warning(message, inchikey)
        # NOTE(review): the other required files are prefixed with the
        # inchikey; confirm whether the bare 'sources.tsv' name is intended.
        if 'sources.tsv' not in entries:
            self.log_console.warning(('%s does not contain '
                                      'sources.tsv file'), inchikey)
        else:
            self.check_sources_tsv(os.path.join(d, 'sources.tsv'))
        for entry in entries:
            # entries are bare names, so they must be joined with d before
            # testing; otherwise the test is made against the working dir
            entry_path = os.path.join(d, entry)
            if os.path.isdir(entry_path):
                self.check_method_dir(entry_path)

    def check_sources_tsv(self, s):
        """Check that sources.tsv files are consistent with mess.db."""
        pass

    def check_method_dir(self, d):
        """Check that method dirs are consistent with mess.db."""
        pass

    def check_db_structure(self):
        """Check that DB structure/data is self-consistent.

        For every referencing table, the keys it uses are compared with the
        keys present in the referenced table; keys without a counterpart
        ("loose" keys) are reported. Sources listed in the db are also
        checked against the sources directory.
        """
        # check inchikey foreign keys
        synonym_keys = self._distinct_values(
            'SELECT DISTINCT inchikey FROM molecule_synonym', 'inchikey')
        source_keys = self._distinct_values(
            'SELECT DISTINCT inchikey FROM molecule_source', 'inchikey')
        property_keys = self._distinct_values(
            'SELECT DISTINCT inchikey FROM molecule_method_property',
            'inchikey')
        self._report_loose(synonym_keys - self.db_inchikeys,
                           ('%d loose InChIKeys in '
                            'molecule_synonym table'))
        self._report_loose(source_keys - self.db_inchikeys,
                           ('%d loose InChIKeys in '
                            'molecule_source table'))
        self._report_loose(property_keys - self.db_inchikeys,
                           ('%d loose InChIKeys in '
                            'molecule_method_property table'))
        # check that sources in db exist in sources dir
        q = 'SELECT source_id, dirname FROM source'
        source_ids = set()
        source_path = os.path.join(os.path.dirname(__file__), '../../sources')
        for r in self.db.execute(q).fetchall():
            source_ids.add(r.source_id)
            if not os.path.isdir(os.path.join(source_path, r.dirname)):
                self.log_console.warning('%s not in sources directory',
                                         r.dirname)
        # check source foreign keys
        molecule_source_ids = self._distinct_values(
            'SELECT DISTINCT source_id FROM molecule_source', 'source_id')
        self._report_loose(molecule_source_ids - source_ids,
                           ('%d loose source_ids in '
                            'molecule_source table'))
        # check program foreign keys
        program_ids = self._distinct_values(
            'SELECT program_id FROM program', 'program_id')
        method_program_ids = self._distinct_values(
            'SELECT DISTINCT program_id FROM method', 'program_id')
        self._report_loose(method_program_ids - program_ids,
                           ('%d loose program_ids in '
                            'method table'))
        # check parameter foreign keys
        parameter_ids = self._distinct_values(
            'SELECT parameter_id FROM parameter', 'parameter_id')
        method_parameter_ids = self._distinct_values(
            'SELECT DISTINCT parameter_id FROM method_parameter',
            'parameter_id')
        method_tag_parameter_ids = self._distinct_values(
            'SELECT DISTINCT parameter_id FROM method_tag', 'parameter_id')
        self._report_loose(method_parameter_ids - parameter_ids,
                           ('%d loose parameter_ids in '
                            'method_parameter table'))
        self._report_loose(method_tag_parameter_ids - parameter_ids,
                           ('%d loose parameter_ids in '
                            'method_tag table'))
        # check property foreign keys
        property_ids = self._distinct_values(
            'SELECT property_id FROM property', 'property_id')
        mmp_property_ids = self._distinct_values(
            'SELECT DISTINCT property_id FROM molecule_method_property',
            'property_id')
        self._report_loose(mmp_property_ids - property_ids,
                           ('%d loose property_ids in '
                            'molecule_method_property table'))
        # collect the method ids known to the db
        method_ids = self._distinct_values(
            'SELECT method_id, name FROM method', 'method_id')
        # check method foreign keys
        method_parameter_mids = self._distinct_values(
            'SELECT DISTINCT method_id FROM method_parameter', 'method_id')
        method_edge_mids = self._distinct_values(
            ('SELECT DISTINCT parent_method_id, child_method_id  '
             'FROM method_edge'),
            'parent_method_id', 'child_method_id')
        self._report_loose(method_parameter_mids - method_ids,
                           ('%d loose method_ids in '
                            'method_parameter table'))
        self._report_loose(method_edge_mids - method_ids,
                           ('%d loose method_ids in '
                            'method_edge table'))
        # check edge foreign keys
        method_edge_ids = self._distinct_values(
            'SELECT method_edge_id FROM method_edge', 'method_edge_id')
        method_path_edge_ids = self._distinct_values(
            'SELECT DISTINCT method_edge_id FROM method_path_edge',
            'method_edge_id')
        self._report_loose(method_path_edge_ids - method_edge_ids,
                           ('%d loose method_edge_ids in '
                            'method_path_edge table'))
        # check path foreign keys
        method_path_ids = self._distinct_values(
            'SELECT method_path_id FROM method_path', 'method_path_id')
        method_path_edge_pids = self._distinct_values(
            'SELECT DISTINCT method_path_id FROM method_path_edge',
            'method_path_id')
        self._report_loose(method_path_edge_pids - method_path_ids,
                           ('%d loose method_path_ids in '
                            'method_path_edge table'))
        # TODO: check edge closure
        # TODO: check path completeness, connectedness, and length
        # concordance

    def summary(self):
        """Print summary statistics about molecules in MESS.DB."""
        self.log_console.info('%d molecules in MESS.DB',
                              len(self.db_inchikeys))
Пример #3
0
 def execute(self, args):
     """Match molecules against SMARTS patterns, a fingerprint, or a
     spectrophore, writing one delimited row per match to stdout.

     Exactly one of args.smarts, args.fingerprint and args.spectrophore
     must be set; the program exits otherwise. Blank input lines are
     skipped.

     Args:
         args: Parsed command-line arguments. Reads inchikeys (an open
               file whose lines start with an InChIKey), smarts,
               fingerprint, spectrophore, target, path, delimiter, cutoff
               and the spectrophore_* settings.
     """
     if args.inchikeys.name == '<stdin>' and args.inchikeys.isatty():
         sys.exit('No input specified.')
     # parse args
     selected = [arg for arg in (args.smarts,
                                 args.fingerprint,
                                 args.spectrophore) if arg]
     if not selected:
         sys.exit('No operations were selected, nothing to match.')
     if len(selected) > 1:
         sys.exit(('One thing at a time, please. The arguments --smarts, '
                   '--fingerprint, and --spectrophore are mutually '
                   'exclusive.'))
     if args.smarts and args.target:
         self.log_console.warning(('--target ignored, proceeding with '
                                   'SMARTS matching'))
     if args.spectrophore:
         if args.path is None:
             sys.exit(('Spectrophore calculation requires 3D geometry. '
                       'You must specify a 3D geometry with --path.'))
         path = MethodPath()
         path.set_path(args.path)
         method_dir = path.get_path_directory()
         sp_args = {'normalization': args.spectrophore_normalization,
                    'accuracy': args.spectrophore_accuracy,
                    'stereo': args.spectrophore_stereospecificity,
                    'resolution': args.spectrophore_resolution}
     # load target and target fingerprints
     target_mol = None
     target_fp = None
     target_sp = None
     if args.target:
         # the target is either a structure file (format taken from its
         # extension) or, failing that, a SMILES string
         if os.path.exists(args.target):
             target_mol = next(pybel.readfile(args.target.split('.')[-1],
                                              args.target))
         else:
             target_mol = pybel.readstring('smi', args.target)
     if target_mol is not None:
         if args.fingerprint:
             target_fp = self.calculate_fingerprint(target_mol,
                                                    args.fingerprint)
         if args.spectrophore:
             target_sp = self.calculate_spectrophore(target_mol, sp_args)
     # match every input
     db = MessDB()
     inchi_query = 'SELECT inchi FROM molecule WHERE inchikey = ?'
     fp_query = ('SELECT fingerprint FROM molecule_fingerprint '
                 'WHERE inchikey = ? AND name = ? '
                 'AND settings = ? AND method_path_id = ?')
     writer = csv.writer(sys.stdout, delimiter=args.delimiter)
     for row in args.inchikeys:
         fields = row.split()
         if not fields:
             continue  # blank line: nothing to match
         inchikey = fields[0]
         if args.smarts or args.fingerprint:
             # NOTE(review): fetchone() returns None for an inchikey that
             # is not in the molecule table, which surfaces here as a
             # TypeError; confirm whether that should be reported instead.
             inchi = db.execute(inchi_query, (inchikey,)).fetchone()[0]
             mol = pybel.readstring('inchi', 'InChI=%s' % inchi)
         if args.smarts:
             canonical = pybel.ob.OBOp.FindType(b"canonical")
             canonical.Do(mol.OBMol)
             for (smarts_obj,
                  smarts_str) in self.smarts_generator(args.smarts):
                 matches = list(smarts_obj.findall(mol))
                 if matches:
                     writer.writerow([inchikey, smarts_str] + matches)
         if args.fingerprint:
             # use the stored fingerprint when there is one; a missing row
             # makes fetchone() return None, hence the TypeError
             try:
                 fp = db.execute(fp_query, (inchikey, args.fingerprint,
                                            '', '')).fetchone()[0]
             except TypeError:
                 fp = self.calculate_fingerprint(mol, args.fingerprint)
             if target_fp is not None:
                 similarity = self.calculate_similarity(target_fp, fp,
                                                        'tanimoto')
                 if similarity > args.cutoff:
                     writer.writerow([inchikey, args.fingerprint,
                                      args.target, similarity])
             else:
                 writer.writerow([inchikey, args.fingerprint] + fp)
         if args.spectrophore:
             # same stored-or-computed pattern, keyed on the serialized
             # settings and the method path
             try:
                 sp = db.execute(fp_query, (inchikey, 'Spectrophore',
                                            json.dumps(sp_args,
                                                       sort_keys=True),
                                            args.path)).fetchone()[0]
             except TypeError:
                 xyz_file = os.path.join(get_inchikey_dir(inchikey),
                                         method_dir,
                                         '%s.xyz' % inchikey)
                 mol = next(pybel.readfile('xyz', xyz_file))
                 sp = Match.calculate_spectrophore(mol, sp_args)
             if target_sp is not None:
                 try:
                     similarity = self.calculate_similarity(target_sp, sp,
                                                            'cos')
                 except ValueError:
                     similarity = 0
                 if similarity > args.cutoff:
                     writer.writerow([inchikey, 'Spectrophore',
                                      args.target, similarity])
             else:
                 writer.writerow([inchikey, 'Spectrophore'] + sp)
Пример #4
0
class AbstractMethod(object):
    """All methods must inherit from this class.

    Subclasses must provide ``description``, ``geop``, ``prog_name``,
    ``prog_version`` and ``prog_url`` attributes (``parameters`` has a
    class-level default) and must implement check_dependencies(), check()
    and map(); the versions defined here only raise NotImplementedError.

    Attributes:
        db (obj): A MessDB object
        path (obj): A MethodPath object
        log_console (obj): A Log('console') object
        log_all (obj): A Log('all') object
        method_name (str): The name of the method
        description (str): Description of method
        geop (bool): Whether the method generates a new geometry
        prog_name (str): Program name
        prog_version (str): Program version
        prog_url (str): Program url
        parameters (dict): Parameters that affect program execution
    """
    parameters = dict()
    shortdesc = None
    method_citation = None
    prog_citation = None
    _inchikey = None
    _path_id = None
    _parent_path_id = None
    _method_dir = None
    _parent_method_dir = None

    def __init__(self):
        """Set up db, path and loggers, then verify the subclass contract.

        Exits the program (sys.exit) when a required attribute is missing and
        finishes by calling check_dependencies().
        """
        self.db = MessDB()
        self.path = MethodPath()
        self.log_console = Log('console')
        self.log_all = Log('all')
        self.method_name = self.get_method_name()
        required = ('parameters', 'description',
                    'geop',  # flag: method results in new xyz coordinates
                    'prog_name', 'prog_version', 'prog_url')
        try:
            for attribute in required:
                getattr(self, attribute)
        except AttributeError as err:
            print(''.join([str(err), '\n']), file=sys.stderr)
            sys.exit(('Each method class needs to define description, geop, '
                      'prog_name, prog_version, prog_url, '
                      'parameters as attributes.'))
        self.check_dependencies()

    def __hash__(self):
        """Hash based on method name and parameters.

        Parameter keys and values are stringified and lower-cased before
        serialization, so the digest ignores case and value types.

        NOTE: this returns a string rather than an int, so the builtin
        hash() cannot be used on instances; the hex digest is the value
        stored in and looked up from the method table.

        Returns:
            A hex string of the sha1 hash of self.method_name plus
            JSON-serialized self.parameters. Keys are sorted.
        """
        normalized = dict((str(key).lower(), str(value).lower())
                          for key, value in self.parameters.items())
        payload = self.method_name + json.dumps(normalized, sort_keys=True)
        # json.dumps emits ASCII by default, so encoding leaves the digest
        # unchanged while also satisfying hashlib on Python 3.
        return hashlib.sha1(payload.encode('utf-8')).hexdigest()

    @property
    def hash(self):
        """Get hash (the hex digest produced by __hash__)."""
        return self.__hash__()

    @property
    def method_id(self):
        """Get the method_id of the method table row matching self.hash.

        Every access queries the db. Raises AttributeError when no row
        matches (presumably when setup() has not been run yet -- verify).
        """
        query = ('SELECT method_id FROM method '
                 'WHERE hash = ?;')
        row = self.db.execute(query, (self.hash,)).fetchone()
        return row.method_id

    def _ensure_path(self):
        """Re-run _setup_path() when self.path is not set to this method."""
        if self.path.get_method_id() != self.method_id:
            self._setup_path()

    @property
    def path_id(self):
        """Get the path id of the method."""
        self._ensure_path()
        return self._path_id

    @property
    def method_dir(self):
        """Get the directory name of the method."""
        self._ensure_path()
        return self._method_dir

    @property
    def parent_method_dir(self):
        """Get the parent directory name of the method."""
        self._ensure_path()
        return self._parent_method_dir

    @property
    def inchikey(self):
        """Get inchikey."""
        return self._inchikey

    @inchikey.setter
    def inchikey(self, inchikey):
        """Set inchikey, and update inchikey of logger.

        Raises:
            RuntimeError: If inchikey is neither None nor accepted by
                is_inchikey().
        """
        if inchikey is not None and not is_inchikey(inchikey):
            raise RuntimeError('invalid inchikey: %s' % inchikey)
        self._inchikey = inchikey
        self.log_all.inchikey = inchikey

    @classmethod
    def get_method_name(cls):
        """Return the class name lower-cased with underscores removed."""
        return cls.__name__.replace('_', '').lower()

    def _setup_path(self):
        """Setup path given current method id and parent path, then cache
        the resulting path id and directories."""
        self.path.setup_path(self.method_id, self._parent_path_id)
        self._path_id = self.path.get_path_id()
        self._method_dir = self.path.get_path_directory()
        self._parent_method_dir = self.path.get_parent_path_directory()

    def _insert_method(self):
        """Add this method to the method table unless it already exists.

        The program_id is taken from the program row matching prog_name and
        prog_version, so nothing is inserted when that row is absent.
        """
        total_changes = self.db.total_changes
        query = ('INSERT OR IGNORE INTO method '
                 '(program_id, geop, name, shortdesc, citation, hash) '
                 'SELECT program.program_id, ?, ?, ?, ?, ? '
                 'FROM program '
                 'WHERE program.name=? AND program.version=?')
        self.db.execute(query, (self.geop, self.method_name, self.shortdesc,
                                self.method_citation, self.hash,
                                self.prog_name, self.prog_version))
        if self.db.total_changes - total_changes > 0:
            self.log_all.info('new %s method added to MESS.DB',
                              self.method_name)

    def _insert_program(self):
        """Adds row to program table in mess.db."""
        total_changes = self.db.total_changes
        query = ('INSERT OR IGNORE INTO program '
                 '(name, version, url, citation) '
                 'VALUES (?, ?, ?, ?)')
        self.db.execute(query,
                        (self.prog_name, self.prog_version, self.prog_url,
                         self.prog_citation))
        if self.db.total_changes - total_changes > 0:
            self.log_all.info('program %s %s added to MESS.DB',
                              self.prog_name, self.prog_version)

    def _insert_parameters(self):
        """Import the self.parameters dict to mess.db.

        Each parameter name is registered in the parameter table and linked
        to this method, with its setting, in method_parameter. Only the
        method_parameter rows count toward the logged total.
        """
        if not self.parameters:
            return
        method_id = self.method_id  # one lookup instead of one per parameter
        name_query = 'INSERT OR IGNORE INTO parameter (name) VALUES (?)'
        # The parameter id comes from the parameter table alone; joining the
        # unrelated program table attempted one insert per program row.
        link_query = ('INSERT OR IGNORE INTO method_parameter '
                      '(method_id, parameter_id, setting) '
                      'SELECT ?, parameter.parameter_id, ? '
                      'FROM parameter '
                      'WHERE parameter.name=?')
        added_parameters = 0
        for name, setting in self.parameters.items():
            self.db.execute(name_query, (name, ))
            total_changes = self.db.total_changes
            self.db.execute(link_query, (method_id, setting, name))
            added_parameters += (self.db.total_changes - total_changes)
        if added_parameters > 0:
            self.log_all.info('%i method parameters added to MESS.DB',
                              added_parameters)

    def get_insert_property_query(self, inchikey, name, description,
                                  format_, value, units=''):
        """Returns query to insert property value to mess.db.

        The path id stored with the property is self.path_id.

        Args:
            inchikey: The inchikey of a molecule in MESS.DB.
            name: The property name.
            description: A description of the property.
            format_: A description of the format the property is in.
            value: The calculated property.
            units: Units for the property value.

        Returns:
            A (query, values) tuple.
        """
        query = ('INSERT OR IGNORE INTO molecule_method_property_denorm '
                 'VALUES (?, ?, ?, ?, ?, ?, ?);')
        return (query, (inchikey, self.path_id, name, description,
                        format_, units, value))

    def get_insert_moldata_queries(self, inchikey, mol,
                                   description='', units=''):
        """Yields one insert query per entry of mol.data.

        The format recorded for each value is the name of its Python type.
        """
        for name, value in mol.data.iteritems():
            yield self.get_insert_property_query(inchikey,
                                                 name,
                                                 description,
                                                 type(value).__name__,
                                                 value,
                                                 units)

    def get_timing_query(self, inchikey, start):
        """Get a query to insert execution time property into db.

        Args:
            inchikey: The inchikey of a molecule in MESS.DB.
            start: The time.time() value taken when execution started.
        """
        elapsed = time.time() - start
        return self.get_insert_property_query(inchikey, 'runtime',
                                              'execution time',
                                              type(start).__name__,
                                              elapsed, 's')

    def set_parent_path(self, parent_path):
        """Set the parent path (e.g., path to method containing input
        geometry.) Values that are None or not greater than zero are
        ignored."""
        # explicit None check: None cannot be ordered against 0 on Python 3
        if parent_path is not None and parent_path > 0:
            self._parent_path_id = parent_path

    def has_parent_path(self, inchikey):
        """Returns True if molecule has had entire parent path calculated,
        False otherwise."""
        query = ('SELECT inchikey FROM molecule_method_property WHERE '
                 'inchikey = ? AND method_path_id = ?')
        row = self.db.execute(query,
                              (inchikey, self._parent_path_id)).fetchone()
        return row is not None

    def check_dependencies(self):
        """If check_dependencies is not implemented, raise error."""
        raise NotImplementedError(("every method needs a 'check_dependencies' "
                                   'method'))

    def check(self):
        """If check is not implemented, raise error."""
        # the check method should be called before a calculation (so
        # calculations are not repeated) and after (to verify success)
        raise NotImplementedError("every method needs a 'check' method")

    def map(self, inchikey, inchikey_dir):
        """Generally, maps molecule to calculation via method, emits
        query/value pairs.
        """
        raise NotImplementedError(("every method needs a 'map' method"))

    def reduce(self, query, values):
        """Run queries/values on the db.

        Args:
            query: An SQL statement for executemany.
            values: A sequence of parameter tuples for query.
        """
        total_changes = self.db.total_changes
        # ``values and values[0]`` rather than ``values[0]``: an empty batch
        # without a query is a no-op instead of an IndexError.
        if query or (values and values[0]):
            self.db.executemany(query, values)
            self.log_all.info('%i properties added to MESS.DB',
                              self.db.total_changes - total_changes)

    def setup(self):
        """Set up method: register program, method, then parameters."""
        self._insert_program()
        self._insert_method()
        self._insert_parameters()
Пример #5
0
 def execute(self, args):
     """Run select query, output table.

     Rows come from a custom SQL query (args.query, either a path to a file
     containing SQL or the SQL itself), from a property filter
     (args.property_name/operator/value), or from the whole molecule table.
     Each row is then filtered by the inchikeys read from args.inchikeys,
     by args.regex_subset, by the --part/--of subset and by args.smarts,
     and the surviving rows are written to stdout as delimited text.

     Exits via sys.exit with a message on inconsistent arguments, invalid
     SQL, or a query whose first column is not inchikey.
     """
     if args.inchikeys.name == '<stdin>' and args.inchikeys.isatty():
         filter_from = None  # interactive stdin: no inchikey filtering
     else:
         try:
             filter_from = set(row.split()[0].strip()
                               for row in args.inchikeys)
         except IndexError:
             # a blank input line ends the run without any output
             return
     if args.query and (args.property_name
                        or args.property_operator
                        or args.property_value):
         sys.exit(('Custom SQL queries are mutually exclusive with '
                   'property filtering.'))
     if (args.part or args.of) and not (args.part and args.of):
         sys.exit(('If you specify a --part n, you must also specify --of '
                   'N (e.g. something like --part 1 --of 5).'))
     subset = None
     if args.part and args.of:
         if args.part > args.of:
             sys.exit('--part must be smaller than --of.')
         if args.part < 1:
             sys.exit('--part must be >=1.')
         alpha = string.ascii_uppercase
         alpha3 = [''.join([a, b, c]) for a in alpha
                   for b in alpha
                   for c in alpha]  # AAA to ZZZ
         if args.of > len(alpha3):
             sys.exit(('MESS.DB does not support subsetting into more than '
                       '%i parts.' % len(alpha3)))
         # every args.of-th three-letter prefix, starting at this part; a
         # tuple so that str.startswith can test all prefixes in one call
         subset = tuple(alpha3[args.part - 1::args.of])
     db = MessDB()
     try:
         cur = db.cursor()
         if args.query:
             try:
                 with codecs.open(args.query, encoding='utf-8') as sql_file:
                     sql = sql_file.read()
             except IOError:
                 # not a readable file: treat the argument itself as sql
                 try:
                     cur.execute(args.query)
                 except sqlite3.OperationalError:
                     sys.exit(("'%s' is neither valid sql nor a path "
                               'to a file containing valid sql.')
                              % args.query)
             else:
                 try:
                     cur.execute(sql)
                 except sqlite3.OperationalError:
                     sys.exit("'%s' does not contain valid sql."
                              % args.query)
         elif (args.property_name and args.property_operator and
               args.property_value is not None):
             query, values = self.property_query(args.property_name,
                                                 args.property_operator,
                                                 args.property_value,
                                                 args.path)
             cur.execute(query, values)
         else:
             cur.execute('SELECT inchikey FROM molecule')
         # check that sql returns inchikey in first column; description is
         # empty/None when the statement returned no result columns
         # (assumes sqlite3 cursor semantics -- TODO confirm for MessDB)
         if (not cur.description
                 or cur.description[0][0].lower() != 'inchikey'):
             sys.exit('Query must return inchikey in first column.')
         # print table
         writer = csv.writer(sys.stdout, delimiter=args.delimiter)
         if args.headers:
             writer.writerow([h[0] for h in cur.description])
         inchi_query = 'SELECT inchi FROM molecule WHERE inchikey = ?'
         for result in cur:
             inchikey = result[0]
             if filter_from is not None and inchikey not in filter_from:
                 continue
             if args.regex_subset and not re.match(args.regex_subset,
                                                   inchikey,
                                                   re.IGNORECASE):
                 continue
             if subset is not None and not inchikey.startswith(subset):
                 continue
             if args.smarts:
                 inchi = db.execute(inchi_query, (inchikey,)).fetchone()[0]
                 mol = pybel.readstring('inchi', 'InChI=%s' % inchi)
                 matches = sum(len(smarts_obj.findall(mol))
                               for (smarts_obj, _)
                               in Match.smarts_generator(args.smarts))
                 if not matches:
                     continue
             writer.writerow([xstr(v).decode('utf-8') for v in result])
     finally:
         # must be closed manually to prevent db locking during pipe
         db.close()
Пример #6
0
class Annotate(AbstractTool):
    """This tool annotates molecules with synonyms (common names, CAS, etc) and
    various fingerprints.

    NOTE(review): self.log_console and self.log_all are used but not set in
    this class; presumably AbstractTool provides them -- confirm.
    """

    _INCHI_SELECT_QUERY = 'SELECT inchi FROM molecule WHERE inchikey = ?'
    _FP_SELECT_QUERY = ('SELECT fingerprint FROM molecule_fingerprint '
                        'WHERE inchikey = ? '
                        'AND name = ? '
                        'AND settings = ? '
                        'AND method_path_id = ?')
    _FP_INSERT_QUERY = ('INSERT INTO molecule_fingerprint '
                        '(inchikey, name, settings, '
                        'fingerprint, method_path_id) '
                        'VALUES (?, ?, ?, ?, ?)')
    _SYNONYM_SELECT_QUERY = ('SELECT inchikey FROM molecule_synonym '
                             'WHERE inchikey = ? AND name = ?')
    _SYNONYM_INSERT_QUERY = ('INSERT INTO molecule_synonym (inchikey, name) '
                             'VALUES (?, ?)')

    def __init__(self):
        """Set description of tool."""
        self.description = 'Annotate molecules with synonyms and fingerprints'
        self.epilog = ''

    def subparse(self, subparser):
        """Set tool-specific argparse arguments."""
        subparser.add_argument('inchikeys', nargs='?',
                               type=argparse.FileType('r'), default=sys.stdin,
                               help=('a list of inchikeys (default: STDIN)'))
        subparser.add_argument('-c', '--cir', action='store_true',
                               help=('get IUPAC names and other synonyms from '
                                     'the Chemical Information Resolver web '
                                     'service'))
        subparser.add_argument('-f', '--fingerprint', type=str,
                               choices=[b'FP2', b'FP3', b'FP4',
                                        b'MACCS', b'MNA', b'MPD'],
                               help=('calculate fingerprint'))
        subparser.add_argument('-s', '--spectrophore', action='store_true',
                               help=('calculate Spectrophore '
                                     'descriptor/fingerprint; '
                                     'requires 3D geometry (i.e., you must '
                                     'set a path to a method that has '
                                     'generated xyz coordinates)'))
        subparser.add_argument('-p', '--path', type=int, default=None,
                               help=('specify a path id, only used for '
                                     'Spectrophore'))
        sp_group = subparser.add_argument_group(('Spectrophore optional '
                                                 'arguments'))
        sp_group.add_argument('-sn', '--spectrophore-normalization', type=str,
                              default='No', choices=[b'No',
                                                     b'ZeroMean',
                                                     b'UnitStd',
                                                     b'ZeroMeanAndUnitStd'],
                              help=('perform normalization of Spectrophore'))
        sp_group.add_argument('-sa', '--spectrophore-accuracy', type=int,
                              default=20, choices=[1, 2, 5, 10, 15,
                                                   20, 30, 36, 45, 60],
                              help=('Spectrophore accuracy expressed as '
                                    'angular stepsize; lower is more accurate '
                                    'but slower'))
        sp_group.add_argument('-ss', '--spectrophore-stereospecificity',
                              type=str, default='No',
                              choices=[b'No', b'Unique', b'Mirror', b'All'],
                              help=('cage type in terms of the underlying '
                                    'pointgroup: P1 or P-1'))
        sp_group.add_argument('-sr', '--spectrophore-resolution', type=float,
                              metavar='FLOAT', default=3.0,
                              help=('required Spectrophore resolution in '
                                    'Angstroms'))

    def execute(self, args):
        """Annotate every molecule listed in args.inchikeys.

        Depending on the flags, each molecule gets its IUPAC name and
        synonyms from CIR (--cir), a fingerprint (--fingerprint) and/or a
        Spectrophore (--spectrophore, which needs --path). Fingerprints that
        are already stored are never overwritten; a mismatch only produces a
        warning. Blank input lines are skipped.
        """
        if args.inchikeys.name == '<stdin>' and args.inchikeys.isatty():
            sys.exit('No input specified.')
        if not (args.cir or args.fingerprint or args.spectrophore):
            sys.exit('You did not request any annotations.')
        method_dir = None
        sp_args = None
        if args.spectrophore:
            if args.path is None:
                sys.exit(('Spectrophore calculation requires 3D geometry. '
                          'You must specify a 3D geometry with --path.'))
            path = MethodPath()
            path.set_path(args.path)
            method_dir = path.get_path_directory()
            sp_args = {'normalization': args.spectrophore_normalization,
                       'accuracy': args.spectrophore_accuracy,
                       'stereo': args.spectrophore_stereospecificity,
                       'resolution': args.spectrophore_resolution}
        self.db = MessDB()
        for row in args.inchikeys:
            fields = row.split()
            if not fields:
                continue  # blank line: nothing to annotate
            self.inchikey = fields[0].strip()
            if args.cir:
                self.update_iupac(self.inchikey)
                self.update_synonyms(self.inchikey)
            if args.fingerprint:
                self._annotate_fingerprint(args.fingerprint)
            if args.spectrophore:
                self._annotate_spectrophore(method_dir, sp_args, args.path)

    def _select_fingerprint_row(self, name, settings, path_id):
        """Return the stored fingerprint row of the current molecule for the
        given name/settings/path, or None if there is none."""
        return self.db.execute(self._FP_SELECT_QUERY,
                               (self.inchikey, name,
                                settings, path_id)).fetchone()

    def _insert_fingerprint(self, name, settings, fingerprint, path_id):
        """Store str(fingerprint) for the current molecule."""
        self.db.execute(self._FP_INSERT_QUERY,
                        (self.inchikey, name, settings,
                         str(fingerprint), path_id))

    def _annotate_fingerprint(self, fp_name):
        """Calculate the fp_name fingerprint of the current molecule from
        its stored InChI and add it to the db if it is not there yet."""
        inchi = self.db.execute(self._INCHI_SELECT_QUERY,
                                (self.inchikey,)).fetchone()[0]
        mol = pybel.readstring('inchi', 'InChI=%s' % inchi)
        canonical = pybel.ob.OBOp.FindType(b'canonical')
        canonical.Do(mol.OBMol)
        fp = Match.calculate_fingerprint(mol, fp_name)
        # plain fingerprints are stored with empty settings and path id
        row = self._select_fingerprint_row(fp_name, '', '')
        if row is None:
            self._insert_fingerprint(fp_name, '', fp, '')
            self.log_all.info('%s fingerprint for %s added to db',
                              fp_name, self.inchikey)
        elif str(fp) != row[0]:
            self.log_console.warning(('new %s fingerprint '
                                      'for %s did not match '
                                      'fingerprint in db, '
                                      'db not updated'),
                                     fp_name, self.inchikey)

    def _annotate_spectrophore(self, method_dir, sp_args, path_id):
        """Calculate the Spectrophore of the current molecule from the xyz
        file in method_dir and add it to the db if it is not there yet."""
        xyz_file = os.path.join(get_inchikey_dir(self.inchikey),
                                method_dir,
                                '%s.xyz' % self.inchikey)
        mol = next(pybel.readfile('xyz', xyz_file))
        sp = Match.calculate_spectrophore(mol, sp_args)
        # the sorted-key JSON of the settings is part of the lookup key
        json_sp_args = json.dumps(sp_args, sort_keys=True)
        row = self._select_fingerprint_row('Spectrophore', json_sp_args,
                                           path_id)
        if row is None:
            self._insert_fingerprint('Spectrophore', json_sp_args, sp,
                                     path_id)
            self.log_all.info(('Spectrophore fingerprint for %s '
                               'with parameters %s and '
                               'geometry from path %i '
                               'added to db'),
                              self.inchikey, json_sp_args, path_id)
        elif str(sp) != row[0]:
            self.log_console.warning(('new Spectrophore '
                                      'fingerprint for '
                                      '%s did not match '
                                      'fingerprint in db, '
                                      'db not updated'),
                                     self.inchikey)

    def _add_synonyms(self, inchikey, names):
        """Insert the names not yet stored as synonyms of inchikey.

        Logs the number of new synonyms when at least one was added.

        Returns:
            The number of synonyms inserted.
        """
        added = 0
        for name in names:
            if self.db.execute(self._SYNONYM_SELECT_QUERY,
                               (inchikey, name)).fetchone() is None:
                self.db.execute(self._SYNONYM_INSERT_QUERY, (inchikey, name))
                added += 1
        if added > 0:
            plural = 's' if added > 1 else ''
            self.log_all.info('%i new synonym%s for %s added',
                              added, plural, inchikey)
        return added

    def update_synonyms(self, inchikey):
        """Get synonyms from CIR and load them into mess.db."""
        synonyms = self.cir_request(inchikey, 'names')
        if synonyms:
            # skip blank entries (e.g. from a trailing newline) so that no
            # empty synonym is stored
            self._add_synonyms(inchikey,
                               [name for name in synonyms.split('\n')
                                if name.strip()])

    def update_iupac(self, inchikey):
        """Get IUPAC name from CIR and load it into mess.db.

        When CIR returns several names, the longest one becomes the IUPAC
        name and the others are stored as synonyms. Nothing happens when
        CIR gives no (or an empty) response.
        """
        response = self.cir_request(inchikey, 'iupac_name')
        if not response:
            return
        iupacs = response.splitlines(True)
        # if multiple iupacs, take the longest (most specific) one
        longest = max(iupacs, key=len)
        iupac = longest.rstrip()
        iupac_select_query = ('SELECT iupac FROM molecule '
                              'WHERE inchikey = ?')
        iupac_update_query = ('UPDATE molecule SET iupac = ? '
                              'WHERE inchikey = ?')
        db_iupac = self.db.execute(iupac_select_query,
                                   (inchikey, )).fetchone()[0]
        if not db_iupac == iupac:
            self.db.execute(iupac_update_query, (iupac, inchikey))
            self.log_all.info('iupac name for %s updated', inchikey)
        # if multiple, add the others (all but the longest) as synonyms
        self._add_synonyms(inchikey,
                           [line.rstrip() for line in iupacs
                            if line != longest])

    def cir_request(self, inchikey, representation):
        """Make request to CIR (Chemical Information Resolver).

        Args:
            inchikey: A valid InChIKey.
            representation: The representation desired from CIR.

        Returns:
            CIR's response, or None if there isn't one (request failed or
            the status code was not 200).
        """
        url = 'http://cactus.nci.nih.gov/chemical/structure/%s/%s' % (
            inchikey, representation)
        headers = {'User-Agent': 'MESS.DB'}
        request = urllib2.Request(url, None, headers)
        try:
            response = urllib2.urlopen(request)
            try:
                if response.getcode() == 200:
                    time.sleep(0.2)  # protect cactus from hammering
                    return response.read()
            finally:
                response.close()
        except urllib2.URLError as err:
            if hasattr(err, 'reason'):
                # reason may be an exception object rather than a string
                reason = str(err.reason).lower()
                self.log_console.info('%s %s %s in cir',
                                      inchikey, representation, reason)
        return None