def test_get_formula_and_connectivity(self):
    """ Check the string returned by `get_formula_and_connectivity`.

    Two inputs are exercised: an InChI that carries several layers after
    the connectivity layer, and an InChI that has no `/c` layer at all.
    """
    # (input InChI, expected formula-and-connectivity string)
    cases = (
        (
            'InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8'
            '/h2-10H,1H2,(H2,11,12,13)/t2-,3-,4+,5-,6-/m1/s1',
            'C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8',
        ),
        # no connectivity layer: only the formula is expected back
        ('InChI=1S/H2O/h1H2', 'H2O'),
    )
    for inchi, expected in cases:
        molecule = molecule_util.InchiMolecule(inchi)
        self.assertEqual(molecule.get_formula_and_connectivity(), expected)
def test_is_protonation_isomer(self):
    """ Check `is_protonation_isomer` on molecules that differ in their
    `/h` layer and on one whose `/c` layer also differs.
    """
    formula_and_connectivity = (
        'InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8')

    # same formula and connectivity, different /h layers
    first = molecule_util.InchiMolecule(formula_and_connectivity + '/h2')
    second = molecule_util.InchiMolecule(formula_and_connectivity + '/h3')
    # same formula, but a different /c layer
    other_connectivity = molecule_util.InchiMolecule(
        'InChI=1S/C6H13O9P/c7/h4')

    # NOTE(review): `first` and `second` are each only compared with
    # themselves; they are never compared with one another.
    self.assertTrue(first.is_protonation_isomer(first))
    self.assertTrue(second.is_protonation_isomer(second))
    self.assertFalse(first.is_protonation_isomer(other_connectivity))
def test_is_tautomer(self):
    """ Check `is_tautomer` on molecules that differ only in their `/t`
    layer and on one whose `/c` layer also differs.

    Each of the first two molecules is checked against itself, and the
    first is checked against the molecule with the different `/c` layer.
    """
    a = molecule_util.InchiMolecule(
        'InChI=1S/C5H5N5O/c6-5-9-3-2(4(11)10-5)7-1-8-3'
        '/h1H,(H4,6,7,8,9,10,11)/t1')
    b = molecule_util.InchiMolecule(
        'InChI=1S/C5H5N5O/c6-5-9-3-2(4(11)10-5)7-1-8-3'
        '/h1H,(H4,6,7,8,9,10,11)/t2')
    c = molecule_util.InchiMolecule(
        'InChI=1S/C5H5N5O/c6/h1H,(H4,6,7,8,9,10,11)/t2')

    self.assertTrue(a.is_tautomer(a))
    # exercise `b` as well, so that it is not constructed for nothing
    self.assertTrue(b.is_tautomer(b))
    self.assertFalse(a.is_tautomer(c))
def test_is_stereoisomer(self):
    """ Check `is_stereoisomer` for a molecule against itself, against a
    molecule that differs only in its `/t` layer, and against a molecule
    with a different `/c` layer.
    """
    # everything up to and including the /h layer is shared by the first two
    shared_layers = (
        'InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8'
        '/h2-10H,1H2,(H2,11,12,13)')

    glc6p = molecule_util.InchiMolecule(
        shared_layers + '/t2-,3-,4+,5-,6-/m1/s1')
    gal6p = molecule_util.InchiMolecule(
        shared_layers + '/t2-,3+,4+,5-,6?/m1/s1')
    fru6p = molecule_util.InchiMolecule(
        'InChI=1S/C6H13O9P/c7-1-3(8)5(10)6(11)4(9)2-15-16(12,13)14'
        '/h4-7,9-11H,1-2H2,(H2,12,13,14)/t4-,5-,6-/m1/s1')

    self.assertTrue(glc6p.is_stereoisomer(glc6p))
    self.assertTrue(glc6p.is_stereoisomer(gal6p))
    self.assertFalse(glc6p.is_stereoisomer(fru6p))
def test_is_equal(self):
    """ Check `is_equal`, both with its default arguments and with
    `check_stereochemistry=False`.
    """
    # default comparison: the two inputs differ only by the trailing /p-1
    with_p_layer = molecule_util.InchiMolecule('InChI=1S/BrH/h1H/p-1')
    without_p_layer = molecule_util.InchiMolecule('InChI=1S/BrH/h1H')
    self.assertTrue(with_p_layer.is_equal(with_p_layer))
    self.assertFalse(with_p_layer.is_equal(without_p_layer))

    # comparison with the stereochemistry check switched off
    shared_layers = (
        'InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8'
        '/h2-10H,1H2,(H2,11,12,13)')
    glc6p = molecule_util.InchiMolecule(
        shared_layers + '/t2-,3-,4+,5-,6-/m1/s1')
    gal6p = molecule_util.InchiMolecule(
        shared_layers + '/t2-,3+,4+,5-,6?/m1/s1')
    fru6p = molecule_util.InchiMolecule(
        'InChI=1S/C6H13O9P/c7-1-3(8)5(10)6(11)4(9)2-15-16(12,13)14'
        '/h4-7,9-11H,1-2H2,(H2,12,13,14)/t4-,5-,6-/m1/s1')

    self.assertTrue(glc6p.is_equal(glc6p, check_stereochemistry=False))
    self.assertTrue(glc6p.is_equal(gal6p, check_stereochemistry=False))
    self.assertFalse(glc6p.is_equal(fru6p, check_stereochemistry=False))
def test(self):
    """ Check that `InchiMolecule` splits an InChI string into the expected
    attributes and that `str()` of the result reproduces the input.
    """
    # every attribute an InchiMolecule is expected to have, all empty
    blank_layers = dict.fromkeys((
        'formula',
        'connections',
        'hydrogens',
        'protons',
        'charge',
        'double_bonds',
        'stereochemistry',
        'stereochemistry_parity',
        'stereochemistry_type',
        'isotopes',
        'fixed_hydrogens',
        'reconnected_metals',
    ), '')

    # formula + /c + /h
    inchi = 'InChI=1S/C3H4O3/c1-2(4)3(5)6/h1H3,(H,5,6)'
    parsed = molecule_util.InchiMolecule(inchi)
    expected = dict(blank_layers,
                    formula='C3H4O3',
                    connections='1-2(4)3(5)6',
                    hydrogens='1H3,(H,5,6)')
    self.assertEqual(parsed.__dict__, expected)
    self.assertEqual(str(parsed), inchi)

    # formula + /c only
    inchi = 'InChI=1S/C3H4O3/c1-2(4)3(5)6'
    parsed = molecule_util.InchiMolecule(inchi)
    expected = dict(blank_layers,
                    formula='C3H4O3',
                    connections='1-2(4)3(5)6')
    self.assertEqual(parsed.__dict__, expected)
    self.assertEqual(str(parsed), inchi)

    # strings that must survive a parse/serialize round trip unchanged
    round_trip_inchis = (
        'InChI=1S/Ni/q+2',
        'InChI=1S/BrH/h1H/p-1',
        'InChI=1S/p+1',
        'InChI=1S/4O.V/q;3*-1;',
    )
    for inchi in round_trip_inchis:
        self.assertEqual(str(molecule_util.InchiMolecule(inchi)), inchi)

    # an input that starts with 'InChI=1/' is expected to be serialized
    # with the 'InChI=1S/' prefix
    remaining_layers = ('C6H12O6/c7-1-3-4(9)5(10)6(11,2-8)12-3'
                        '/h3-5,7-11H,1-2H2/t3-,4-,5+,6u/m0/s1')
    inchi_1 = 'InChI=1/' + remaining_layers
    inchi_2 = 'InChI=1S/' + remaining_layers
    self.assertEqual(str(molecule_util.InchiMolecule(inchi_1)), inchi_2)
def to_inchi(self, only_formula_and_connectivity=False):
    """ Get the structure in InChI format

    The InChI string is obtained from `molecule_util.Molecule`, built from
    `self.structure`.

    Args:
        only_formula_and_connectivity (:obj:`bool`): if :obj:`True`, return
            only the formula and connectivity layers

    Returns:
        :obj:`str`: structure in InChI format, or just the formula and
            connectivity layers if :obj:`only_formula_and_connectivity`
            is :obj:`True`
    """
    full_inchi = molecule_util.Molecule(structure=self.structure).to_inchi()

    if not only_formula_and_connectivity:
        return full_inchi

    layers = molecule_util.InchiMolecule(full_inchi)
    return layers.get_formula_and_connectivity()
def get_concentration_by_structure(self, inchi, only_formula_and_connectivity=True,
                                   select=models.Concentration):
    """ Get the concentrations of metabolites with a given structure.

    The query joins :obj:`select` to :obj:`models.Metabolite` (through
    `select.metabolite`) and then to :obj:`models.Structure`, and filters
    on the structure.

    Args:
        inchi (:obj:`str`): InChI structure to find concentrations for
        only_formula_and_connectivity (:obj:`bool`, optional): if
            :obj:`True`, match on the formula and connectivity layers of
            :obj:`inchi`; if :obj:`False`, match on the complete InChI string
        select (optional): what to query; it must provide a `metabolite`
            attribute for the join. Defaults to :obj:`models.Concentration`

    Returns:
        :obj:`list`: all results of the query (:obj:`models.Concentration`
            objects with the default :obj:`select`)
    """
    query = self.data_source.session.query(select)
    query = query.join((models.Metabolite, select.metabolite))
    query = query.join((models.Structure, models.Metabolite.structure))

    if not only_formula_and_connectivity:
        # exact match on the stored InChI string
        condition = models.Structure._value_inchi == inchi
    else:
        # match on the formula and connectivity layers only
        core_layers = molecule_util.InchiMolecule(inchi).get_formula_and_connectivity()
        condition = models.Structure._structure_formula_connectivity == core_layers

    return query.filter(condition).all()
def get_metabolites_by_structure(self, inchi, only_formula_and_connectivity=False,
                                 select=models.Metabolite):
    """ Get metabolites with the same structure. Optionally, get metabolites
    which only have the same InChI formula and connectivity layers.

    Args:
        inchi (:obj:`str`): molecule structure in InChI format
        only_formula_and_connectivity (:obj:`bool`, optional): if
            :obj:`True`, get metabolites which have the same formula and
            connectivity layers; if :obj:`False`, get metabolites whose
            stored InChI string is identical to :obj:`inchi`
        select (optional): what to query; defaults to
            :obj:`models.Metabolite`

    Returns:
        :obj:`sqlalchemy.orm.query.Query`: query for matching metabolites
    """
    query = self.data_source.session.query(select)
    query = query.join((models.Structure, models.Metabolite.structure))

    if not only_formula_and_connectivity:
        # exact match on the stored InChI string
        condition = models.Structure._value_inchi == inchi
    else:
        # match on the formula and connectivity layers only
        core_layers = molecule_util.InchiMolecule(inchi).get_formula_and_connectivity()
        condition = models.Structure._structure_formula_connectivity == core_layers

    # the query itself is returned; the caller decides how to execute it
    return query.filter(condition)
def load_content(self):
    """ Download the content of ECMDB and store it to a local sqlite database.

    The compound index is downloaded as a zip archive containing
    `ecmdb.json`. The entries are sorted by `m2m_id` and truncated to
    `self.max_entries`; the details of each remaining compound are then
    downloaded as XML and stored as a `Compound` with its synonyms,
    compartments, concentrations and cross references. Compounds whose
    details cannot be downloaded are skipped with a warning.

    Raises:
        :obj:`requests.exceptions.HTTPError`: if the compound index, or the
            separately downloaded structure of a compound, cannot be
            retrieved
        :obj:`ValueError`: if a concentration is not reported in `uM` or a
            temperature is not reported in `oC`
    """
    db_session = self.session
    req_session = self.requests_session

    # download content from server
    if self.verbose:
        print('Downloading compound IDs ...')

    response = req_session.get(self.DOWNLOAD_INDEX_URL)
    response.raise_for_status()

    if self.verbose:
        print(' done')

    # unzip and parse content
    if self.verbose:
        print('Parsing compound IDs ...')

    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_file:
        with zip_file.open('ecmdb.json', 'r') as json_file:
            entries = json.load(json_file)

    if self.verbose:
        print(' found {} compounds'.format(len(entries)))

    # sort entries
    entries.sort(key=lambda e: e['m2m_id'])

    # limit number of processed entries
    if len(entries) > self.max_entries:
        entries = entries[0:self.max_entries]

    # (XML element, resource namespace, prefix prepended to the ID)
    # NOTE(review): 'wikipidia' is the key used by the original code;
    # presumably it mirrors the spelling in the ECMDB XML -- confirm before
    # "correcting" it.
    cross_reference_sources = (
        ('biocyc_id', 'biocyc', ''),
        ('cas_registry_number', 'cas', ''),
        ('chebi_id', 'chebi', 'CHEBI:'),
        ('chemspider_id', 'chemspider', ''),
        ('foodb_id', 'foodb.compound', ''),
        ('het_id', 'ligandexpo', ''),
        ('hmdb_id', 'hmdb', ''),
        ('kegg_id', 'kegg.compound', ''),
        ('msds_url', 'msds.url', ''),
        ('pubchem_compound_id', 'pubchem.compound', ''),
        ('wikipidia', 'wikipedia.en', ''),
    )

    # load content into sqlite database
    if self.verbose:
        print('Downloading {} compounds ...'.format(len(entries)))

    xml_parser = jxmlease.Parser()
    for i_entry, entry in enumerate(entries):
        if self.verbose and (i_entry % 10 == 0):
            print(' Downloading compound {} of {}'.format(
                i_entry + 1, len(entries)))

        # get details
        response = req_session.get(
            self.DOWNLOAD_COMPOUND_URL.format(entry['m2m_id']))
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            # skip this compound, but keep processing the others
            warnings.warn(
                'Unable to download data for compound {}'.format(
                    entry['m2m_id']),
                data_source.DataSourceWarning)
            continue

        entry_details = xml_parser(response.text)['compound']

        compound = self.get_or_create_object(
            Compound, id=self.get_node_text(entry_details['m2m_id']))

        if 'name' in entry_details:
            compound.name = self.get_node_text(entry_details['name'])

        if 'description' in entry_details:
            compound.description = self.get_node_text(
                entry_details['description'])

        compound.structure = self.get_node_text(entry_details['inchi'])
        if not compound.structure:
            # no InChI in the XML: fall back to the structure download
            response2 = req_session.get(
                self.DOWNLOAD_COMPOUND_STRUCTURE_URL.format(
                    entry['m2m_id']))
            response2.raise_for_status()
            compound.structure = response2.text

        compound.comment = entry['comment']

        compound.created = dateutil.parser.parse(
            self.get_node_text(
                entry_details['creation_date'])).replace(tzinfo=None)
        compound.updated = dateutil.parser.parse(
            self.get_node_text(
                entry_details['update_date'])).replace(tzinfo=None)

        # calculate core InChI layers to facilitate searching
        try:
            compound._structure_formula_connectivity = \
                molecule_util.InchiMolecule(compound.structure) \
                .get_formula_and_connectivity()
        except ValueError:
            warnings.warn(
                'Unable to encode structure for {} in InChI'.format(
                    entry['m2m_id']),
                data_source.DataSourceWarning)
            compound._structure_formula_connectivity = None

        # synonyms
        compound.synonyms = []

        for key in ('iupac_name', 'traditional_iupac'):
            if key in entry_details:
                name = self.get_node_text(entry_details[key])
                compound.synonyms.append(
                    self.get_or_create_object(Synonym, name=name))

        parent_node = entry_details['synonyms']
        if 'synonym' in parent_node:
            for node in self.get_node_children(parent_node, 'synonym'):
                name = self.get_node_text(node)
                compound.synonyms.append(
                    self.get_or_create_object(Synonym, name=name))

        # locations
        compound.compartments = []
        parent_node = entry_details['cellular_locations']
        if 'cellular_location' in parent_node:
            for node in self.get_node_children(parent_node,
                                               'cellular_location'):
                name = self.get_node_text(node)
                compound.compartments.append(
                    self.get_or_create_object(Compartment, name=name))

        # todo (enhancement): parse experimental properties
        # * state
        # * melting_point
        # * water_solubility
        # * logp_hydrophobicity

        # concentrations
        compound.concentrations = []
        parent_node = entry_details['concentrations']
        if 'concentration' in parent_node:
            # parallel lists: position i of each list describes the i-th
            # concentration
            values = self.get_node_children(parent_node, 'concentration')
            errors = self.get_node_children(parent_node, 'error')
            units = self.get_node_children(parent_node,
                                           'concentration_units')
            strains = self.get_node_children(parent_node, 'strain')
            statuses = self.get_node_children(parent_node, 'growth_status')
            medias = self.get_node_children(parent_node, 'growth_media')
            temperatures = self.get_node_children(parent_node,
                                                  'temperature')
            systems = self.get_node_children(parent_node, 'growth_system')
            references = self.get_node_children(parent_node, 'reference')

            for i_conc, value_node in enumerate(values):
                value = float(self.get_node_text(value_node))
                # a missing error is stored as NaN
                error = float(self.get_node_text(errors[i_conc]) or 'nan')

                conc_unit = self.get_node_text(units[i_conc])
                if conc_unit != 'uM':
                    raise ValueError(
                        'Unsupport units: {}'.format(conc_unit))

                if temperatures[i_conc]:
                    # the text is split into a number and a unit on ' '
                    temperature, temperature_unit = self.get_node_text(
                        temperatures[i_conc]).split(' ')
                    temperature = float(temperature)
                    if temperature_unit != 'oC':
                        raise ValueError(
                            'Unsupport units: {}'.format(temperature_unit))
                else:
                    temperature = None

                concentration = Concentration(
                    value=value,
                    error=error,
                    strain=self.get_node_text(strains[i_conc]) or None,
                    growth_status=self.get_node_text(
                        statuses[i_conc]) or None,
                    media=self.get_node_text(medias[i_conc]) or None,
                    temperature=temperature,
                    growth_system=self.get_node_text(
                        systems[i_conc]) or None,
                )
                db_session.add(concentration)

                if 'pubmed_id' in references[i_conc]:
                    pmid_nodes = self.get_node_children(
                        references[i_conc], 'pubmed_id')
                    for node in pmid_nodes:
                        pmid = self.get_node_text(node)
                        concentration.references.append(
                            self.get_or_create_object(
                                Resource, namespace='pubmed', id=pmid))

                compound.concentrations.append(concentration)

        # cross references
        compound.cross_references = []
        for key, namespace, prefix in cross_reference_sources:
            ref_id = self.get_node_text(entry_details[key])
            if ref_id:
                if prefix:
                    ref_id = prefix + ref_id
                compound.cross_references.append(
                    self.get_or_create_object(
                        Resource, namespace=namespace, id=ref_id))

        # add to session
        db_session.add(compound)

        # commit after every 100th entry when intermediate commits are on
        if self.commit_intermediate_results and (i_entry % 100 == 99):
            db_session.commit()

    if self.verbose:
        print(' done')

    # commit changes to database
    if self.verbose:
        print('Saving database ...')

    db_session.commit()

    if self.verbose:
        print(' done')
def test_remove_layer(self):
    """ Check that removing the `protons` layer drops the trailing `/p-1`
    from the serialized InChI string.
    """
    original = 'InChI=1S/BrH/h1H/p-1'
    expected = 'InChI=1S/BrH/h1H'

    molecule = molecule_util.InchiMolecule(original)
    molecule.remove_layer('protons')

    self.assertEqual(str(molecule), expected)