Exemplo n.º 1
0
def workOnCMG(CMGName):
    """Append CAS numbers and IUPAC names for one CMG's compounds to a CSV.

    Reads one PubChem CID per line from ``CIDS_CMG/CIDS_<CMGName>.txt``,
    fetches synonyms with ``pcp.get_synonyms`` in batches, and appends rows
    ``CMGName,CASRN,IUPAC name`` to ``CIDS_Results/<CMGName>.csv``.

    For every compound returned by PubChem:
      * one row is written per distinct synonym prefix that looks like a
        CAS Registry Number (the IUPAC name column is empty if unknown);
      * if no synonym looks like a CASRN but an IUPAC name exists, a single
        row with an empty CASRN column is written;
      * otherwise nothing is written.

    :param CMGName: CMG name; used to build both file paths and written as
        the first column of every row.
    """
    import csv  # local import keeps this function self-contained

    source_filename = 'CIDS_CMG/CIDS_%s.txt' % (CMGName)
    target_filename = 'CIDS_Results/%s.csv' % (CMGName)

    # Batching avoids timeouts in pcp.get_synonyms on long CID lists.
    batch_size = 300
    # re.match semantics: the pattern only has to match at the start of a
    # synonym, exactly as before.
    casrn_pattern = re.compile(r'(\d{2,7}-\d\d-\d)')

    with open(source_filename, 'r') as master:
        # Blank lines would otherwise be sent to PubChem as empty CIDs.
        cids = [line.strip() for line in master if line.strip()]

    with open(target_filename, 'a') as findings:
        # csv.writer quotes fields containing commas, which IUPAC names
        # frequently do (e.g. "1,2-dichloroethane").
        writer = csv.writer(findings, lineterminator='\n')

        # Stepping by batch_size covers every CID exactly once, including
        # the last one, and performs no lookup for an empty list.
        for begin in range(0, len(cids), batch_size):
            batch = cids[begin:begin + batch_size]
            bounds = (begin, begin + len(batch) - 1)
            # Single-argument print(...) is valid on Python 2 and Python 3.
            print("Processing pcp.get_synonyms with cids batch  %s" % (bounds,))
            results = pcp.get_synonyms(batch)
            print("Finding CASRN matches in the synonyms ...")
            for result in results:
                # Check every synonym, not just the last one.
                casrns = []
                for syn in result.get('Synonym', []):
                    match = casrn_pattern.match(syn)
                    if match and match.group(1) not in casrns:
                        casrns.append(match.group(1))

                # One compound lookup per result instead of one per synonym.
                iupac_name = pcp.Compound.from_cid(result.get('CID')).iupac_name

                if casrns:
                    for casrn in casrns:
                        writer.writerow([CMGName, casrn, iupac_name or ''])
                elif iupac_name:
                    writer.writerow([CMGName, '', iupac_name])
Exemplo n.º 2
0
def workOnCMG(CMGName):
    """Append CAS numbers and IUPAC names for one CMG's compounds to a CSV.

    Reads one PubChem CID per line from ``CIDS_CMG/CIDS_<CMGName>.txt``,
    fetches synonyms with ``pcp.get_synonyms`` in batches, and appends rows
    ``CMGName,CASRN,IUPAC name`` to ``CIDS_Results/<CMGName>.csv``.

    For every compound returned by PubChem:
      * one row is written per distinct synonym prefix that looks like a
        CAS Registry Number (the IUPAC name column is empty if unknown);
      * if no synonym looks like a CASRN but an IUPAC name exists, a single
        row with an empty CASRN column is written;
      * otherwise nothing is written.

    :param CMGName: CMG name; used to build both file paths and written as
        the first column of every row.
    """
    import csv  # local import keeps this function self-contained

    source_filename = 'CIDS_CMG/CIDS_%s.txt' % (CMGName)
    target_filename = 'CIDS_Results/%s.csv' % (CMGName)

    # Batching avoids timeouts in pcp.get_synonyms on long CID lists.
    batch_size = 300
    # re.match semantics: the pattern only has to match at the start of a
    # synonym, exactly as before.
    casrn_pattern = re.compile(r'(\d{2,7}-\d\d-\d)')

    with open(source_filename, 'r') as master:
        # Blank lines would otherwise be sent to PubChem as empty CIDs.
        cids = [line.strip() for line in master if line.strip()]

    with open(target_filename, 'a') as findings:
        # csv.writer quotes fields containing commas, which IUPAC names
        # frequently do (e.g. "1,2-dichloroethane").
        writer = csv.writer(findings, lineterminator='\n')

        # Stepping by batch_size covers every CID exactly once, including
        # the last one, and performs no lookup for an empty list.
        for begin in range(0, len(cids), batch_size):
            batch = cids[begin:begin + batch_size]
            bounds = (begin, begin + len(batch) - 1)
            # Single-argument print(...) is valid on Python 2 and Python 3.
            print("Processing pcp.get_synonyms with cids batch  %s" % (bounds,))
            results = pcp.get_synonyms(batch)
            print("Finding CASRN matches in the synonyms ...")
            for result in results:
                # Check every synonym; previously only the value left over
                # from the last synonym was tested, and compounds without
                # synonyms reused stale (or undefined) values.
                casrns = []
                for syn in result.get('Synonym', []):
                    match = casrn_pattern.match(syn)
                    if match and match.group(1) not in casrns:
                        casrns.append(match.group(1))

                # One compound lookup per result instead of one per synonym.
                iupac_name = pcp.Compound.from_cid(result.get('CID')).iupac_name

                if casrns:
                    for casrn in casrns:
                        writer.writerow([CMGName, casrn, iupac_name or ''])
                elif iupac_name:
                    writer.writerow([CMGName, '', iupac_name])
Exemplo n.º 3
0
def get_cas_pcp(cid):
    """Extract a CAS number from the PubChem synonyms of a compound.

    The synonyms are scanned in the order PubChem returns them and the
    first one that starts with a CAS-like pattern (2-7 digits, 2 digits,
    1 digit, separated by hyphens) wins. Only the start of the synonym is
    anchored, so trailing text after the number is ignored.

    :param cid: (int) CID from PubChem.
    :return:
        cas: (str) CAS Registry Number, or '' when no synonym matches.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    casrn_pattern = re.compile(r'(\d{2,7}-\d\d-\d)')

    for result in pcp.get_synonyms(cid, 'cid'):
        for syn in result.get('Synonym', []):
            match = casrn_pattern.match(syn)
            if match:
                return match.group(1)
    return ''
Exemplo n.º 4
0
    Entrez.email = '*****@*****.**'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results



for i in range(110,729):
    text = ''
    count = 0
    for id in mydf[mydf[str(i)]==1].sample(frac=1).iterrows():
        # print(id[1]['0_x'])
        try:
            k = pcp.get_synonyms(id[1]['0_x'], 'inchi')
        except:
            k = []

        try:
            pubmedIDs = search(k[0]['Synonym'][0])
            papers = fetch_details(pubmedIDs['IdList'])
            count += 1

        except:
            print('NO RESULTS')
            papers = {}

        try:
            for x in papers['PubmedArticle']:
                print('#############################################')
Exemplo n.º 5
0
def _discard_if_invalid_mol(download_file):
    """Delete ``download_file`` when it is not a usable mol file.

    :param download_file: path of the file that was just written.
    :return: True if the file was removed (``is_binary_string`` flagged its
        first 1024 bytes or ``is_empty_mol_file`` flagged the file),
        False if it was kept.
    """
    # Read the probe bytes inside ``with`` so the handle is closed before a
    # possible os.remove(); a still-open handle can block deletion.
    with open(download_file, 'rb') as handle:
        head = handle.read(1024)
    if is_binary_string(head) or is_empty_mol_file(download_file):
        os.remove(download_file)
        return True
    return False


def extract_mol_from_pubchem(cas_nr):
    """Download the PubChem structure for a CAS number as ``<cas_nr>.mol``.

    The file is written into the module-level ``download_path`` directory.
    The CAS number is first searched as a compound name; the hit is only
    accepted when ``cas_nr`` is listed among its synonyms, and its SDF is
    fetched from the PUG REST endpoint. When no compound is found, the
    substances matching ``cas_nr`` are tried in order and the first one that
    lists ``cas_nr`` as a synonym and yields a valid SDF is used.

    :param cas_nr: (str) CAS Registry Number to look up.
    :return: ``-1`` if a non-empty ``<cas_nr>.mol`` already exists,
        ``0`` if the file was downloaded and passed validation,
        ``cas_nr`` itself if nothing usable could be downloaded (including
        any exception raised along the way, which is printed only when the
        module-level ``debug`` flag is set).
    """
    headers = {
        'user-agent':
        'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }

    try:
        file_name = cas_nr + '.mol'
        download_file = Path(download_path) / file_name

        # Check for an existing download first so that no PubChem request
        # is made for a file we already have.
        if download_file.exists() and os.stat(download_file).st_size != 0:
            return -1

        # Name search is an exact match by default; the result is a list of
        # CIDs, empty when PubChem does not know this CAS number.
        cid = pcp.get_cids(cas_nr, 'name')

        if cid:
            cid = cid[0]

            # Double-check the hit: the CAS number must appear verbatim in
            # the synonym list of the first result.
            synonyms = pcp.get_synonyms(cid)[0]['Synonym']
            if cas_nr not in synonyms:
                raise ValueError('\tThis is not an exact match!')

            get_sdf_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/sdf'.format(
                cid)
            r = requests.get(get_sdf_url, headers=headers, timeout=15)

            # Accept only a direct 200 answer, not one reached via redirect.
            if r.status_code == 200 and len(r.history) == 0:
                download_file.write_text(data=r.text)
                if _discard_if_invalid_mol(download_file):
                    return cas_nr
                return 0

        else:
            # No compound found: fall back to substances with this name.
            substances = pcp.get_substances(cas_nr, 'name')

            if len(substances) == 0:
                raise ValueError(
                    'Could not find any compounds or substances with this CAS {} on Pubchem.'
                    .format(cas_nr))

            for substance in substances:
                # Only trust substances that list the same CAS number.
                if cas_nr in substance.synonyms:
                    sdf = pcp.get_sdf(identifier=substance.sid,
                                      namespace='sid',
                                      domain='substance')
                    if sdf:  # pcp.get_sdf returns None when there is no SDF
                        download_file.write_text(data=sdf)
                        # An invalid file is removed and the next substance
                        # is tried.
                        if not _discard_if_invalid_mol(download_file):
                            return 0

        # Nothing usable was downloaded: hand the CAS number back.
        return cas_nr

    except Exception as error:
        if debug:
            print('Error during search structure in Pubchem:\n\t{}'.format(
                error))
        return cas_nr
Exemplo n.º 6
0
def pka_lookup_pubchem(identifier, namespace=None, domain='compound') -> Optional[dict]:
    """Look up a compound on PubChem and return its pKa with identifying data.

    The identifier is resolved to a CID with ``pcp.get_cids``, checked for
    an exact match where the identifier type allows it, and the
    "Dissociation Constants" section is then fetched from the PUG-View REST
    endpoint and parsed.

    Side effect: when the script was started with a single debug switch
    (``--debug=True``, ``--debug=true``, ``--debug`` or ``-d``) the
    module-level ``debug`` flag is set to True.

    :param identifier: text identifying the compound.
    :param namespace: PubChem namespace of ``identifier``. When falsy the
        type is guessed with ``classify``; ``'cas'`` is searched as a name.
    :param domain: accepted for interface compatibility; not used here.
    :return: dict with the keys ``source``, ``Pubchem_CID``, ``pKa``,
        ``reference`` and ``Substance_CASRN`` plus the properties returned
        by ``pcp.get_properties`` (``CanonicalSMILES``, ``IsomericSMILES``
        and ``IUPACName`` renamed to ``Canonical_SMILES``,
        ``Isomeric_SMILES`` and ``IUPAC_Name``); ``None`` when any step
        fails (the traceback is printed only in debug mode).
    """
    global debug

    if len(sys.argv) == 2 and sys.argv[1] in ['--debug=True', '--debug=true', '--debug', '-d']:
        debug = True

    # Identify lookup source (Pubchem in this case)
    lookup_source = 'Pubchem'

    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}

        cids = []
        identifier_type = ''

        if not namespace:
            identifier_type = classify(identifier)

            # Structure identifiers are searched in their own namespace
            # (a "smiles" classification may be a false positive);
            # everything else is searched as a name.
            if identifier_type in ['smiles', 'inchi', 'inchikey']:
                lookup = pcp.get_cids(identifier, namespace=identifier_type)
            else:
                lookup = pcp.get_cids(identifier, namespace='name')
            if lookup:
                cids.append(lookup[0])
        elif namespace == 'cas':
            cids = pcp.get_cids(identifier, namespace='name')
        else:
            cids = pcp.get_cids(identifier, namespace=namespace)

        if not cids:
            # Last resort: retry as a plain name search.
            lookup = pcp.get_cids(identifier, namespace='name')
            if lookup:
                cids.append(lookup[0])

            # NOTE(review): identifier_type is only set from ``namespace``
            # on this fallback path, so an explicit namespace whose first
            # lookup succeeds skips the exact-match checks below - confirm
            # whether that is intended.
            identifier_type = namespace

        if not cids:
            raise RuntimeError('Compound not found in Pubchem.')

        # Use the first CID PubChem returned.
        cid = cids[0]

        exact_match = True

        synonyms = pcp.get_synonyms(cid)[0]['Synonym'] or []

        # The first synonym that is exactly a CAS-formatted number is
        # reported as the substance's CASRN.
        returned_cas = ''
        for synonym in synonyms:
            cas_nr = re.search(r'^\d{2,7}-\d{2}-\d$', synonym)
            if cas_nr:
                returned_cas = cas_nr.group()
                break

        lookup_result = pcp.get_properties(['inchi', 'inchikey',
                                            'canonical_smiles', 'isomeric_smiles',
                                            'iupac_name'],
                                           cid)

        # Verify the hit against the input where the identifier type
        # permits a direct comparison.
        if identifier_type == 'cas':
            exact_match = identifier in synonyms
        elif identifier_type == 'inchi':
            exact_match = (identifier == lookup_result[0].get('InChI', False))
        elif identifier_type == 'inchikey':
            exact_match = (identifier == lookup_result[0].get('InChIKey', False))

        if not exact_match:
            if debug:
                print(f'Exact match between input and Pubchem return value? {identifier in synonyms}')
            raise ValueError('This is not an exact match on Pubchem!')

        # PUG-View URL of the pKa section. 'XML' could be replaced with
        # 'JSON', but that is harder to parse later on. Output types:
        # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest$_Toc494865558
        pka_lookup_result_xml = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{}/XML?heading=Dissociation+Constants'.format(cid)

        r = requests.get(pka_lookup_result_xml, headers=headers, timeout=15)

        # Accept only a direct 200 answer, not one reached via redirect.
        if not (r.status_code == 200 and len(r.history) == 0):
            raise RuntimeError('pKa not found in Pubchem.')

        tree = ET.fromstring(r.text)

        # First <Information> element of the response.
        info_node = tree.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}Information')

        # pKa reference and value; the leading 'pKa = ' is stripped from
        # the value.
        original_source = info_node.find('{http://pubchem.ncbi.nlm.nih.gov/pug_view}Reference').text
        pka_result = info_node.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}String').text
        pka_result = re.sub(r'^pKa = ', '', pka_result)

        core_result = {
            'source': lookup_source,
            'Pubchem_CID': str(cid),
            'pKa': pka_result,
            'reference': original_source,
            'Substance_CASRN': returned_cas,
        }
        extra_info = lookup_result[0]
        extra_info.pop('CID', None)    # the CID is already in core_result

        # Merge both dicts and rename the PubChem property keys; a plain
        # dict comprehension keeps key order and values unchanged.
        renamed_keys = {
            'CanonicalSMILES': 'Canonical_SMILES',
            'IsomericSMILES': 'Isomeric_SMILES',
            'IUPACName': 'IUPAC_Name',
        }
        merged = {**core_result, **extra_info}
        return {renamed_keys.get(key, key): value for key, value in merged.items()}

    except Exception as error:
        if debug:
            # Positional arguments: the ``etype`` keyword was removed from
            # traceback.format_exception in Python 3.10.
            traceback_str = ''.join(traceback.format_exception(type(error), error, error.__traceback__))
            print(traceback_str)

        return None
Exemplo n.º 7
0
def pcp_getter(drug, request='name'):
	"""Return the PubChem synonyms of ``drug`` joined with ';'.

	:param drug: identifier passed to ``pcp.get_synonyms``.
	:param request: PubChem namespace of ``drug`` (default ``'name'``).
	:return: the synonyms of the first result joined with ';', or '' when
		PubChem returns no result or the first result has no synonyms.
	"""
	p = pcp.get_synonyms(drug, request)
	if not p:
		return ""
	# .get instead of ["Synonym"]: presumably the key is absent when an
	# entry has no synonyms, which used to raise KeyError here.
	return ";".join(p[0].get("Synonym") or [])
Exemplo n.º 8
0
import pandas
import pubchempy as pcp

# Fetch the compound with PubChem CID 5090 and keep it in c
c = pcp.Compound.from_cid(5090)

# Look up the compounds named 'Quercetin' and keep them in results
results = pcp.get_compounds('Quercetin', 'name')

# Single-argument print(...) prints the same text on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(results)

# Fetch the synonyms recorded for 'Quercetin' and keep them in Qsyn
Qsyn = pcp.get_synonyms('Quercetin', 'name')
Exemplo n.º 9
0
# For every "<CMGName>,<listkey>" line: fetch the CIDs stored under the list
# key, collect the CAS numbers found in their synonyms and append them to
# Results/<CMGName>.csv. Failures are logged to Results/<CMGName>_error.txt.
for line in keymaster:
	# A blank line (e.g. a trailing newline) has no key to look up.
	if not line.strip():
		continue
	data = line.split(',', 2)
	CMGName = data[0]
	key = str(data[1]).replace('\n', '')
	# Single-argument print(...) is valid on Python 2 and Python 3.
	print("Getting cids list for " + CMGName + " with key " + key)
	try:
		rr = requests.get('http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/%s/cids/JSON' % key)
		cids = rr.json()['IdentifierList']['CID']
		# Split the big list of cids into smaller lists
		batch_indexes = batchIndexes(len(cids))
		cas_rns = []
		for index in batch_indexes:
			print("Processing pcp.get_synonyms with cids batch  %s" % (index,))
			# NOTE(review): the slice excludes index[1]; if batchIndexes
			# returns the index of the last element as the final end value,
			# the last CID is never looked up - confirm against batchIndexes.
			results = pcp.get_synonyms(cids[index[0]:index[1]])
			print("Finding CASRN matches in the synonyms ...")
			for result in results:
				for syn in result.get('Synonym', []):
					match = re.match(r'(\d{2,7}-\d\d-\d)', syn)
					# Test every synonym: with the old mixed tab/space
					# indentation this check ran once per compound, after
					# the loop, and only saw the last synonym.
					if match:
						cas_rns.append(match.group(1))
		print("Writing results to file ...")
		# ``with`` closes the file so the rows are flushed to disk.
		with open('Results/%s.csv' % CMGName, 'a') as findings:
			for element in cas_rns:
				findings.write(CMGName + ',' + element + '\n')
	except Exception as e:
		# str(e) instead of e.message, which does not exist on Python 3.
		error_message = str(e)
		print("Checking " + CMGName + " throws error:")
		print(error_message)
		with open('Results/%s_error.txt' % CMGName, 'a') as findings:
			findings.write(CMGName + ": something is wrong.\nError message is: " + error_message + '\n')