示例#1
0
def pathwayInfo(code):
    """Return summary information about a KEGG pathway.

    Args:
        code: KEGG pathway identifier handed to ``KEGG.get``.

    Returns:
        A three-item list ``[code, name, class]``. ``name`` is the first
        ``NAME`` entry and ``class`` the ``CLASS`` entry of the parsed record,
        both with commas replaced by semicolons so they cannot break a
        comma-separated output column. A missing field, or a record that
        could not be fetched or parsed, is reported as ``'NA'``.
    """
    # Initialize searcher
    kSearcher = KEGG()

    # KEGG.get hands back an int status code instead of text when the lookup
    # fails; there is nothing to parse in that case.
    result = kSearcher.get(code)
    if isinstance(result, int):
        return [code, 'NA', 'NA']

    # Parse the raw record into a dictionary; anything else means the
    # record was not usable.
    dictResult = kSearcher.parse(result)
    if not isinstance(dictResult, dict):
        return [code, 'NA', 'NA']

    # The code always comes first in the returned list
    pathwayList = [code]

    # Pathway name, or 'NA' when the record has none
    if 'NAME' in dictResult:
        # Replace commas by semicolons to avoid wrong column formatting
        # at the end of the script
        nameStr = str(dictResult['NAME'][0].replace(',', ';'))
        pathwayList.append(nameStr)
    else:
        pathwayList.append('NA')

    # Pathway class, or 'NA' when the record has none
    if 'CLASS' in dictResult:
        # Same comma -> semicolon substitution as for the name
        classStr = str(dictResult['CLASS']).replace(',', ';')
        pathwayList.append(classStr)
    else:
        pathwayList.append('NA')

    return pathwayList
示例#2
0
def get_kegg_info(stId):
    """
    Fetch a KEGG entry by its identifier and return the parsed record.

    The raw response of ``KEGG.get`` is passed straight to ``KEGG.parse``;
    whatever the parser produces is returned unchanged.
    """
    service = KEGG()
    return service.parse(service.get(stId))
示例#3
0
def get_genes_from_kegg_pathway(pathway):
    """Return the gene symbols listed in the ``GENE`` field of a KEGG pathway.

    The KEGG client is configured for organism ``hsa``. Every ``GENE`` entry
    is split on a double space into two parts; the second parts are returned
    as a tuple.
    """
    from bioservices.kegg import KEGG
    service = KEGG()
    service.organism = 'hsa'
    record = service.parse(service.get(pathway))
    # Each entry splits into (first part, second part); transpose the pairs.
    pairs = [entry.split('  ') for entry in record['GENE']]
    entrez, symbol = zip(*pairs)
    return symbol
示例#4
0
def get_single_compound_metadata_online(compound_id):
    """Look up metadata for one compound in KEGG or ChEBI.

    Identifiers starting with ``C`` (case-insensitive) are fetched from KEGG
    and returned as the parsed record. Any other identifier is prefixed with
    ``CHEBI:`` and resolved through ``ChEBI.getCompleteEntity``.
    """
    if not compound_id.upper().startswith('C'):
        chebi = ChEBI()
        return chebi.getCompleteEntity('CHEBI:' + compound_id)

    kegg = KEGG()
    return kegg.parse(kegg.get(compound_id))
示例#5
0
def extract_orthologs(filename):
    '''
    Create dictionary with keggid as key and list of orthologs as value

        arg: tab-separated file with a 'kegg_id' column
        return: dict mapping each keggid to a list with one sub-list per
                matching organism; every sub-list holds "<organism>:<gene>"
                strings

    Only organisms whose code appears in the 'KEGG' column of
    'gammaproteo.csv' (read from the working directory) are kept.
    Rows whose kegg_id is "no" and ids for which KEGG yields an int
    instead of a record are skipped.
    '''

    orthos_dict = {}
    k = KEGG()

    # tupleize_cols was removed from read_csv in pandas 1.0 (passing it now
    # raises TypeError) and only ever affected multi-row headers.
    df = pd.read_csv(filename, sep="\t")
    df_gamma = pd.read_csv('gammaproteo.csv', sep="\t")
    # A set gives constant-time membership tests inside the loops below.
    gamma_codes = set(df_gamma['KEGG'].tolist())

    # loop through keggid to get orthologs
    for keggid in df['kegg_id']:

        if keggid == "no":
            continue

        print(str(keggid))

        # get orthologs on kegg
        dict_data = k.parse(k.get(keggid))

        if isinstance(dict_data, int):
            continue

        # keep only the organisms that are gammaproteobacteria
        ortho_list = []
        for key, value in dict_data['GENES'].items():
            organism = key.lower()
            if organism not in gamma_codes:
                continue

            # Gene ids are the whitespace-separated tokens before the first
            # '('; split once instead of once per gene.
            gene_ids = value.split('(')[0].split()
            ortho_list.append(
                [organism + ":" + gene_id for gene_id in gene_ids])

        orthos_dict[keggid] = ortho_list

    return orthos_dict
示例#6
0
 def id2seq(self, hsa):
     """Write the amino-acid sequence of a KEGG entry to ``dummy.txt``.

     The entry ``hsa`` is fetched and parsed through KEGG; its ``AASEQ``
     field, with all whitespace removed, is written as a single FASTA
     record whose header is ``>`` followed by ``hsa``. When the record has
     no usable ``AASEQ`` field the sequence is left empty.

     Returns:
         None. The result is the file written to the working directory.
     """
     s = KEGG()
     dict_d = s.parse(s.get(hsa))
     try:
         seq = re.sub(r'\s+', '', dict_d['AASEQ'])
     except (KeyError, TypeError):
         # KeyError: record has no AASEQ field.
         # TypeError: the lookup did not yield a subscriptable record or
         # the field is not text.
         seq = ''
     # The context manager closes the file even if the write fails.
     with open("dummy.txt", "w") as text_file:
         text_file.write('>' + str(hsa) + '\n' + seq)
     return None
示例#7
0
def get_metabs(KEGG, reac_id):
    """Split a KEGG reaction equation into substrate and product names.

    Args:
        KEGG: object exposing ``get`` and ``parse`` (a KEGG client instance).
        reac_id: KEGG reaction identifier.

    Returns:
        ``[substrates, products]``: two lists holding the stripped terms
        found on the left and on the right of ``<=>`` in the record's
        ``EQUATION`` field.
    """
    # Retrieve and parse the reaction record
    record = KEGG.parse(KEGG.get(reac_id))

    # Left-hand side holds the substrates, right-hand side the products
    sides = re.split('<=>', record['EQUATION'])

    # Terms on each side are separated by plus signs
    substrates = [term.strip() for term in sides[0].split('+')]
    products = [term.strip() for term in sides[1].split('+')]

    return [substrates, products]
示例#8
0
def get_compound_metadata_online(kegg_ids):
    """Fetch a display name for each KEGG id.

    Args:
        kegg_ids: sized sequence of KEGG identifiers.

    Returns:
        Dict mapping every id whose record could be read to
        ``{'display_name': <first NAME entry with all ';' removed>}``.
        Ids whose record is unusable are reported on stdout and left out.
    """
    s = KEGG()
    metadata_map = {}
    total = len(kegg_ids)
    for i, kegg_id in enumerate(kegg_ids):
        if i % 10 == 0:
            print("Retrieving %d/%d KEGG records" % (i, total))

        # Reset per iteration so the diagnostic below never shows the
        # previous id's record or hits an unbound name.
        d = None
        try:
            d = s.parse(s.get(kegg_id))
            # KEGG names carry ';' separators; drop them.
            first_name = d['NAME'][0].replace(';', '')
        except (TypeError, KeyError, IndexError):
            # Not a record, no NAME field, or an empty NAME list: report
            # and carry on with the remaining ids.
            print('kegg_id=%s parsed_data=%s' % (kegg_id, d))
            continue

        metadata_map[kegg_id] = {'display_name': first_name}
    return metadata_map
示例#9
0
def get_seq(filename):
    '''
    Create dictionary with species as keys and sequences as values for an alignment

    arg: name of a FASTA file inside 'alignments_nogaps/'
    return: dict mapping an organism name to the list of its sequences,
            each sequence stored as a list of single characters

    Record ids are expected to look like '<org>_<gene>'; they are turned into
    '<org>:<gene>' for the KEGG lookup. Ids for which KEGG yields an int are
    printed and skipped.
    '''

    k = KEGG()
    records = list(SeqIO.parse(os.path.join('alignments_nogaps/', filename), "fasta"))

    orgdict = {}

    # go through sequences and search for organism name on kegg
    for record in records:

        idsplit = record.id.split('_', 1)
        kegg_id = idsplit[0] + ':' + idsplit[1]

        handle = k.get(kegg_id)
        if isinstance(handle, int):
            print(kegg_id)
            continue

        # Second and third whitespace-separated tokens of the ORGANISM field
        # form the organism name used as dictionary key.
        org = k.parse(handle)['ORGANISM'].split()
        org = org[1] + " " + org[2]

        # Group in one pass instead of rescanning the organism list once
        # per distinct organism afterwards.
        orgdict.setdefault(org, []).append(list(str(record.seq)))

    return orgdict
def queryKegg(theIDs):
    """Query KEGG for every identifier and collect the parsed records.

    The first three characters of each identifier are dropped; the remainder
    is searched in the ``acb`` database with ``KEGG.find``. The text before
    the first tab of the search result is fetched and parsed.

    Returns:
        ``(records, trimmed_ids)``: the parsed records and the shortened
        identifiers, both in input order.
    """
    print("Currently querying KEGG...")
    service = KEGG()
    records = []
    trimmed_ids = []

    for raw_id in theIDs:
        short_id = raw_id[3:]
        # Only the first tab-separated field of the search result is used
        first_hit = service.find("acb", short_id).split('\t')[0]
        records.append(service.parse(service.get(first_hit)))
        trimmed_ids.append(short_id)

    return records, trimmed_ids
示例#11
0
def main():
    """Download the KEGG record of every gene in ``hsa_gene_list.json``.

    The keys of the JSON object in ``hsa_gene_list.json`` are used as KEGG
    gene ids. Each id is printed, fetched and parsed; the parsed records are
    written, keyed by gene id, to ``ginfo.json``.
    """
    # Start KEGG interface
    k = KEGG()

    # Read in KEGG gene ID & gene symbol pairs
    with open("hsa_gene_list.json", "r") as g:
        gene_data = json.load(g)

    # Parsed record per gene id
    data = {}
    for gene in gene_data:
        # print() with one argument behaves the same on Python 2 and 3,
        # unlike the print statement, which Python 3 rejects.
        print(gene)
        data[gene] = k.parse(k.get(gene))

    with open('ginfo.json', 'w') as fw:
        json.dump(data, fw)
示例#12
0
    def get_reaction_ECs_from_kegg(self):
        """Fill ``self.reaction_ECs`` with the ENZYME entries KEGG reports.

        Every item of ``self.model.reactions`` is split on single spaces into
        KEGG reaction ids. Ids not yet present in ``self.reaction_ECs`` are
        fetched from KEGG and their ``ENZYME`` entries stored in a set keyed
        by the id. Any exception raised while handling an item is printed and
        the rest of that item is skipped.
        """
        self.reaction_ECs = defaultdict(set)

        service = KEGG()
        for reaction in self.model.reactions:
            # NOTE(review): this list is shared by all ids of one item, so a
            # later id also receives the ECs of the ids before it - confirm
            # this is intended.
            collected = []
            try:
                for kegg_id in reaction.split(" "):
                    if kegg_id in self.reaction_ECs:
                        continue
                    print("KEGG reaction", kegg_id)
                    collected += service.parse(service.get(kegg_id))['ENZYME']
                    # Touch the defaultdict only when there is something to
                    # store, so ids without ECs stay absent from the mapping.
                    if collected:
                        self.reaction_ECs[kegg_id].update(collected)
            except Exception as inst:
                print(inst)

        print("EC data loaded from KEGG")
def mapSpecies(mousepeptrackfilename):
	"""Add human UniProt accessions to a peptide-tracker TSV and cache annotations.

	Python 2 code (print statements, urllib2, httplib). Steps, in order:

	1. Collect the unique 'Gene' values of the tab-separated input file and
	   map them through the UniProt upload-lists service to reviewed entries
	   with organism id 9606.
	2. Rewrite the input file IN PLACE with an extra
	   'Human UniProtKB Accession' column.
	3. For every mapped human accession download the UniProt XML entry and
	   extract protein/gene/organism names, taxonomy id, DrugBank links, GO
	   terms and diseases; pickle the result to 'humanUniprotfuncinfodic.obj'.
	4. Map the accessions to KEGG ids, fetch each KEGG record and pickle the
	   PATHWAY values per accession to 'humankeggdic.obj'.

	Arg:
		mousepeptrackfilename: path of the tab-separated peptide file; it is
			both read and overwritten.
	Returns nothing; results are the rewritten file and the two pickles.
	"""
	# Raise the csv field size limit so very long fields can be read
	csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
	uniproturl = 'https://www.uniprot.org/uploadlists/'

	# Seconds to wait before repeating a failed request
	RETRY_TIME = 20.0
	mdf= pd.read_csv(mousepeptrackfilename, delimiter='\t')
	mouseGenes=list(mdf['Gene'].unique())
	# Drop missing gene names (NaN)
	mouseGenes=[g for g in mouseGenes if str(g) !='nan']

	# gene name -> human UniProt accession
	mousehumandic={}
	# Query the mapping service in batches of 1000 gene names
	for gx in range(0,len(mouseGenes),1000):
		genecodes=' '.join(mouseGenes[gx:gx+1000])
		geneuniprotparams = {
		'from':'GENENAME',
		'to':'ACC',
		'format':'tab',
		'query':genecodes,
		'columns':'id,genes(PREFERRED),organism-id,reviewed'

		}
		# Repeat the request until it succeeds; there is no retry limit
		while True:
			try:
				geneuniprotdata = urllib.urlencode(geneuniprotparams)
				geneuniprotrequest = urllib2.Request(uniproturl, geneuniprotdata)
				geneuniprotresponse = urllib2.urlopen(geneuniprotrequest)
				for guniprotline in geneuniprotresponse:
					gudata=guniprotline.strip()
					# Skip the header line
					if not gudata.startswith("Entry"):
						guinfo=gudata.split("\t")
						# Keep reviewed entries with organism id 9606 whose preferred gene
						# name equals the last column (presumably the submitted gene name
						# echoed back by the service - TODO confirm)
						if '9606' == guinfo[2].lower() and 'reviewed' == guinfo[3].lower() and guinfo[-1].lower() ==guinfo[1].lower() and len(guinfo[0].strip())>1:
							mousehumandic[guinfo[-1].strip()]=guinfo[0].strip()
				break
			except urllib2.HTTPError:
				time.sleep(RETRY_TIME)
				print ('Hey, I am trying again until succeeds to get data from uniprot data!',str(datetime.datetime.now()))
			except httplib.BadStatusLine:
				time.sleep(RETRY_TIME)
				print ('Hey, I am trying again until succeeds to get data from  uniprot data!',str(datetime.datetime.now()))


	# Output header: the input columns plus the new human accession column (last)
	colname=['UniProtKB Accession','Protein','Gene','Organism','Peptide Sequence','Summary Concentration Range Data','All Concentration Range Data','All Concentration Range Data-Sample LLOQ Based','Peptide ID',\
	'Special Residues','Molecular Weight','GRAVY Score','Transitions','Retention Time','Analytical inofrmation',\
	'Gradients','AAA Concentration','CZE Purity','Panel','Knockout','LLOQ','ULOQ','Sample LLOQ','Protocol','Trypsin','QC. Conc. Data','Human UniProtKB Accession']

	finalresult=[]
	finalresult.append(colname)
	humanUniprotID=[]
	with open(mousepeptrackfilename) as csvfile:
		reader = csv.DictReader(csvfile, delimiter='\t')
		for row in reader:
			templist=[]
			# Copy every input column (all of colname except the new last one)
			for i in colname[:-1]:
				tempdata=str(row[i]).strip()
				templist.append(tempdata)
			# templist[2] is the 'Gene' column
			# NOTE(review): rows with an empty gene get no value for the new column,
			# so they end up one field shorter than the header - confirm intended
			if len(str(templist[2]).strip())>0:
				if templist[2] in mousehumandic:
					huUniId=mousehumandic[templist[2]]
					humanUniprotID.append(huUniId)
					templist.append(huUniId)
				else:
					templist.append('NA')

			finalresult.append(templist)

	# Overwrite the input file with the extended table
	with open(mousepeptrackfilename,'wb') as pf:
		pwriter =csv.writer(pf,delimiter='\t')
		pwriter.writerows(finalresult)

	unqhumanUniprotID=list(set(humanUniprotID))
	# accession -> [PN, GN, OG, OGID, diseases, drugbank links, GO ids, GO names, GO aspects]
	humanUniprotfuncinfodic={}
	countProt=0
	for subcode in unqhumanUniprotID:
		# Pause two seconds before each UniProt request
		time.sleep(2)
		# Defaults used when the entry cannot be downloaded or parsed
		drugbanklist=[]
		PN='NA'
		GN='NA'
		OG='NA'
		OGID='NA'
		dislist=[]
		GoIDList=[]
		GoNamList=[]
		GoTermList=[]
		try:
			countProt+=1
			if countProt%1000 ==0:
				print str(countProt), "th protein Protein Name, Gene, Organism Name,drug bank data,disease data job starts",str(datetime.datetime.now())

			# Download the XML representation of the UniProt entry
			SGrequestURL="https://www.uniprot.org/uniprot/"+str(subcode)+".xml"
			SGunifile=urllib.urlopen(SGrequestURL)
			SGunidata= SGunifile.read()
			SGunifile.close()

			try:
				SGunidata=minidom.parseString(SGunidata)
				# dbReference pass 1: DrugBank links and the NCBI taxonomy id
				try:
					drugdata=(SGunidata.getElementsByTagName('dbReference'))
					for duItem in drugdata:
						if (duItem.attributes['type'].value).upper() == 'DRUGBANK':
							try:
								drugname=(str(duItem.getElementsByTagName('property')[0].attributes['value'].value).strip())
								drugid=str(duItem.attributes['id'].value).strip()
								# Stored as an HTML link to the DrugBank page
								durl='<a target="_blank" href="https://www.drugbank.ca/drugs/'+drugid+'">'+drugname+'</a>'
								drugbanklist.append(durl)
							except:
								pass
						if (duItem.attributes['type'].value).strip() == 'NCBI Taxonomy':
							try:
								OGID=str(duItem.attributes['id'].value).strip()
							except:
								pass
				except IndexError:
					pass

				# dbReference pass 2: GO annotations (the taxonomy id is read again here)
				try:
					godata=(SGunidata.getElementsByTagName('dbReference'))
					for gItem in godata:
						if (gItem.attributes['type'].value).upper() == 'GO':
							try:
								# The first property value is split on ':' into an aspect letter
								# (part 0) and the term name (part 1)
								gonamedetails=(str(gItem.getElementsByTagName('property')[0].attributes['value'].value).strip()).split(':')[1]
								gotermdetails=(str(gItem.getElementsByTagName('property')[0].attributes['value'].value).strip()).split(':')[0]
								GoNamList.append(gonamedetails)
								goid=str(gItem.attributes['id'].value).strip()
								GoIDList.append(goid)
								# Translate the aspect letter into its full name
								if gotermdetails.lower()=='p':
									GoTermList.append('Biological Process')
								if gotermdetails.lower()=='f':
									GoTermList.append('Molecular Function')
								if gotermdetails.lower()=='c':
									GoTermList.append('Cellular Component')
							except:
								pass

						if (gItem.attributes['type'].value).strip() == 'NCBI Taxonomy':
							try:
								OGID=str(gItem.attributes['id'].value).strip()
							except:
								pass
				except IndexError:
					pass

				# Protein name: recommendedName first, submittedName as fallback
				try:
					try:
						PN=(((SGunidata.getElementsByTagName('protein')[0]).getElementsByTagName('recommendedName')[0]).getElementsByTagName('fullName')[0]).firstChild.nodeValue

					except:
						PN=(((SGunidata.getElementsByTagName('protein')[0]).getElementsByTagName('submittedName')[0]).getElementsByTagName('fullName')[0]).firstChild.nodeValue

				except IndexError:
					pass

				# Gene name: first <name> of the first <gene> element
				try:
					try:
						GN=((SGunidata.getElementsByTagName('gene')[0]).getElementsByTagName('name')[0]).firstChild.nodeValue
					except:
						GN='NA'
				except IndexError:
					pass

				# Organism name: first <name> of the first <organism> element
				try:
					try:
						OG=((SGunidata.getElementsByTagName('organism')[0]).getElementsByTagName('name')[0]).firstChild.nodeValue
					except:
						OG='NA'
				except IndexError:
					pass

				# Diseases, formatted as "<name>(<acronym>)"
				try:
					disdata=SGunidata.getElementsByTagName('disease')
					for dItem in disdata:
						disname=''
						disshort=''
						try:
							disname=(dItem.getElementsByTagName('name')[0]).firstChild.nodeValue
						except:
							pass
						try:
							disshort=(dItem.getElementsByTagName('acronym')[0]).firstChild.nodeValue
						except:
							pass
						if len(disname.strip())>0:
							dislist.append(str(disname.strip())+'('+str(disshort)+')')
				except IndexError:
					pass

			# XML that cannot be parsed leaves the defaults in place
			except ExpatError:
				pass
		# A failed download leaves the defaults in place
		except IOError:
			pass
		# Collapse each list into a '|'-joined string of unique values ('NA' when empty)
		drugbankdata='NA'
		disdata='NA'
		goiddata='NA'
		gonamedata='NA'
		gotermdata='NA'
		if len(GoIDList)>0:
			goiddata='|'.join(list(set(GoIDList)))
		if len(GoNamList)>0:
			gonamedata='|'.join(list(set(GoNamList)))
		if len(GoTermList)>0:
			gotermdata='|'.join(list(set(GoTermList)))
		if len(drugbanklist)>0:
			drugbankdata='|'.join(list(set(drugbanklist)))
		if len(dislist)>0:
			disdata='|'.join(list(set(dislist)))

		humanUniprotfuncinfodic[subcode]=[PN,GN,OG,OGID,disdata,drugbankdata,goiddata,gonamedata,gotermdata]

	# Persist the per-accession annotation dictionary
	hudicfile='humanUniprotfuncinfodic.obj'
	hudicf = open(hudicfile, 'wb')
	pickle.dump(humanUniprotfuncinfodic, hudicf , pickle.HIGHEST_PROTOCOL)
	hudicf.close()	

	print ("Extracting KEGG pathway name, job starts",str(datetime.datetime.now()))
	# accession -> PATHWAY values of its KEGG record
	hkeggdictfile={}
	hk = KEGG()
	# Map accessions to KEGG ids in batches of 2000
	for hkx in range(0,len(unqhumanUniprotID),2000):
		# NOTE(review): countProt still holds the count from the loop above and
		# is increased by hkx+2000 here, so this progress message fires only when
		# the sum happens to be a multiple of 2000 - confirm intended
		countProt+=hkx+2000
		if countProt%2000 ==0:
			print (str(countProt), "th protein kegg job starts",str(datetime.datetime.now()))

		huniprotcodes=' '.join(unqhumanUniprotID[hkx:hkx+2000])
		huniprotparams = {
		'from':'ACC',
		'to':'KEGG_ID',
		'format':'tab',
		'query':huniprotcodes
		}
		
		# Repeat the request until it succeeds; there is no retry limit
		while True:
			try:
				hkuniprotdata = urllib.urlencode(huniprotparams)
				hkuniprotrequest = urllib2.Request(uniproturl, hkuniprotdata)
				hkuniprotresponse = urllib2.urlopen(hkuniprotrequest)
				for hkuniprotline in hkuniprotresponse:
					hkudata=hkuniprotline.strip()
					# Skip the header line
					if not hkudata.startswith("From"):
						hkuinfo=hkudata.split("\t")
						# Column 0 is the accession, column 1 the mapped KEGG id
						if len(hkuinfo[1].strip()):
							hkegg=hk.get(hkuinfo[1].strip())
							hkudict_data = hk.parse(hkegg)
							# Records without PATHWAY (KeyError) or that are not
							# subscriptable (TypeError) are skipped
							try:
								try:
									if len(str(hkuinfo[0]).strip()) >5:
										hkeggdictfile[hkuinfo[0].strip()]=hkudict_data['PATHWAY'].values()
								except TypeError: 
									pass
							except KeyError:
								pass
				break
			except urllib2.HTTPError:
				time.sleep(RETRY_TIME)
				print ('Hey, I am trying again until succeeds to get data from KEGG!',str(datetime.datetime.now()))
				pass

	# Persist the accession -> pathways dictionary
	hkdicfile='humankeggdic.obj'
	hkdicf = open(hkdicfile, 'wb')
	pickle.dump(hkeggdictfile, hkdicf , pickle.HIGHEST_PROTOCOL)
	hkdicf.close()
示例#14
0
from bioservices.kegg import KEGG

# Fetch and parse the KEGG entry K00855.
k = KEGG()
path = k.get("K00855")
kdict = k.parse(path)

# Show the parsed record, then the help text of the object that holds it.
print(kdict)
help(kdict)

# Write the field names of the record to play.out, one per line.
with open("play.out", "wt") as result:
    field_names = kdict.keys()
    result.write("\n".join(field_names))
示例#15
0
                    ko
                )  #here append may be used, bc only one item is added at a time
            else:
                EC2KO_dic[item] = [ko]
    else:
        KO_dic[ko] = [definition]

#here a list of KOs is made to be searched for - if pathway is defined
kostoget = {}

if args.pathway != 'none':
    output = open('eclist.txt', 'w')

    kegg = KEGG()
    pathway = kegg.get(args.pathway)
    dict_data = kegg.parse(pathway)
    if dict_data == 404:
        print("WARNING: BAD PATHWAY SUBMITTED TO KEGG!")
    elif dict_data == None:
        print("WARNING: ERROR CONNECTING TO KEGG SERVER!")
    #print(dict_data)
    print("ECQUERY: PROCESSING PATHWAY/MODULE")
    try:
        for key in dict_data['ORTHOLOGY'].keys():
            print("adding ortholog {} to eclist".format(key))
            value = dict_data['ORTHOLOGY'][key]
            print(value)
            if "," in key:
                for item in key.split(","):
                    kostoget[item] = value
                    output.write(item + '\n')
示例#16
0
from bioservices.kegg import KEGG
# Fetch the KEGG pathway ko01230 and list every module it references.
kegg = KEGG()
pathway = kegg.get("ko01230")
dict_data = kegg.parse(pathway)
print(dict_data)

# ortholog id -> id of the module it was found in
modules_dict = {}

# The with-block closes (and therefore flushes) modules.txt even when a
# lookup fails half-way; the handle was previously left open.
with open("modules.txt", "w") as output:
    for key in dict_data['MODULE'].keys():
        # Fetch and parse the record of this module
        pathway = kegg.get(key)
        module_data = kegg.parse(pathway)
        #print(module_data)

        # "<ortholog id>_<ortholog description>" for every ortholog
        orthologs = []
        for ortholog in module_data['ORTHOLOGY'].keys():
            data = [ortholog, module_data['ORTHOLOGY'][ortholog]]
            orthologs.append("_".join(data))
            modules_dict[ortholog] = key

        # One tab-separated line per module: id, name, definition, orthologs
        output.write('{}\t{}\t{}\t{}\n'.format(key, module_data['NAME'],
                                               module_data['DEFINITION'],
                                               "//".join(orthologs)))
示例#17
0
def enzymeInfo(code, ignored, stats, verbosity):
    """Collect information about an enzyme and its pathways from KEGG.

    Args:
        code: enzyme identifier handed to ``KEGG.get``.
        ignored: container of pathway ids that must not be looked up.
        stats: dict of counters and lists, updated in place (keys
            ``NB_PATHWAY``, ``ENZYME_ONLY_IGNORED_PATHWAY``,
            ``LIST_ENZYME_ONLY_IGNORED_PATHWAY``, ``MISSING_PATHWAY_IN_KEGG``,
            ``LIST_MISSING_PATHWAY_IN_KEGG``).
        verbosity: forwarded to ``KEGG(verbose=...)``.

    Returns:
        ``False`` when KEGG answers with an int. Otherwise a list of lists:
        the first item is ``[code, names, definition]``, followed by one
        ``pathwayInfo`` result per pathway that is not ignored, or by a
        ``['NA']`` placeholder when the enzyme has no pathway or its only
        pathway is ignored.
    """
    # KEGG searcher
    kSearch = KEGG(verbose=verbosity)

    print(f"[+] Get info about enzyme {code}")
    result = kSearch.get(code)

    # An int answer means the code matched nothing in the databases
    if type(result) is int:
        return False

    dictResult = kSearch.parse(result)
    fields = dictResult.keys()

    # Names: string form of the list with the surrounding quote/bracket
    # characters stripped and ',' turned into ';'
    if 'NAME' in fields:
        namesStr = str(dictResult['NAME']).strip("'[]'").replace(',', ';')
    else:
        namesStr = 'NA'

    # Definition: ',' turned into ';' so output columns stay aligned
    if 'DEFINITION' in fields:
        definitionStr = str(dictResult['DEFINITION']).replace(',', ';')
    else:
        definitionStr = 'NA'

    # The enzyme's own description always comes first
    finalList = [[code, namesStr, definitionStr]]

    # No pathway at all: count it and add an empty placeholder entry
    if 'PATHWAY' not in fields:
        print(f"[!] No pathway detected for enzyme {code}\n")
        stats['MISSING_PATHWAY_IN_KEGG'] += 1
        stats['LIST_MISSING_PATHWAY_IN_KEGG'].append(code)
        finalList.append(['NA'])
        return finalList

    pathwayList = list(dictResult['PATHWAY'].keys())
    for pathway in pathwayList:
        if pathway not in ignored:
            print(f"  [-] Get info about {pathway} pathway")
            suffixList = pathwayInfo(pathway)
            stats['NB_PATHWAY'] += 1
            finalList.append(suffixList)
        elif len(pathwayList) == 1:
            # The only pathway of this enzyme is an ignored one
            print(
                f"  [!] Enzyme {code} have only 1 pathway : {pathway}")
            print(f"  [!] and this pathway is ignored")
            stats['ENZYME_ONLY_IGNORED_PATHWAY'] += 1
            stats['LIST_ENZYME_ONLY_IGNORED_PATHWAY'].append(code)
            # Placeholder pathway entry
            finalList.append(['NA'])
        else:
            print(f"  [!] Ignored pathway : {pathway}")

    return finalList
def mapSpecies(mousepeptrackfilename):
    RETRY_TIME = 20.0
    mouseTohumanfilepath = os.path.join(os.getcwd(), 'MouseToHuman.tsv')
    print("Extracting Mouse to Human Map data, job starts",
          str(datetime.datetime.now()))
    #increase the field size of CSV
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
    try:
        urllib.urlretrieve(
            'http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt',
            mouseTohumanfilepath)
        urllib.urlcleanup()
    except:
        print("Can't able to download MouseToHuman.tsv file!!")

    colnameMousHu = [
        'HomoloGene ID', 'Common Organism Name', 'NCBI Taxon ID', 'Symbol',
        'EntrezGene ID', 'Mouse MGI ID', 'HGNC ID', 'OMIM Gene ID',
        'Genetic Location', 'Genomic Coordinates (mouse: , human: )',
        'Nucleotide RefSeq IDs', 'Protein RefSeq IDs', 'SWISS_PROT IDs'
    ]

    mouseHumandata = []
    homologID = []
    with open(mouseTohumanfilepath) as mhtsvfile:
        mhreader = csv.DictReader(mhtsvfile, delimiter='\t')
        for mhrow in mhreader:
            mhtemplist = []
            for i in colnameMousHu:
                mhtempdata = str(mhrow[i]).strip()
                mhtemplist.append(mhtempdata)
            if len(mhtemplist[-1].strip()) > 0:
                homologID.append(mhtemplist[0])
                mouseHumandata.append(mhtemplist)
    homologID = list(set(homologID))
    homologID.sort()

    mousehumandic = {}
    for homologidItem in homologID:
        tempHumanHomoUniID = ''
        tempMouseHomoUniID = ''
        for item in mouseHumandata:
            if homologidItem == item[0]:
                if 'mouse' in item[1].strip().lower():
                    tempMouseHomoUniID = item[-1].strip()
                else:
                    tempHumanHomoUniID = item[-1].strip()
        if len(tempMouseHomoUniID.strip()) > 0 and len(
                tempHumanHomoUniID.strip()) > 0 and tempHumanHomoUniID.strip(
                ).upper() != 'NA':
            mousehumandic[tempMouseHomoUniID] = tempHumanHomoUniID

    colname=['UniProtKB Accession','Protein','Gene','Organism','Peptide Sequence','Summary Concentration Range Data','All Concentration Range Data','All Concentration Range Data-Sample LLOQ Based','Peptide ID',\
    'Special Residues','Molecular Weight','GRAVY Score','Transitions','Retention Time','Analytical inofrmation',\
    'Gradients','AAA Concentration','CZE Purity','Panel','Knockout','LLOQ','ULOQ','Sample LLOQ','Protocol','Trypsin','QC. Conc. Data','Human UniProtKB Accession']

    finalresult = []
    finalresult.append(colname)
    humanUniprotID = []
    with open(mousepeptrackfilename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            templist = []
            for i in colname[:-1]:
                tempdata = str(row[i]).strip()
                templist.append(tempdata)
            if len(str(templist[0]).strip()) > 0:
                if templist[0].split('-')[0] in mousehumandic:
                    humanUniprotID.append(
                        mousehumandic[templist[0].split('-')[0]])
                    templist.append(mousehumandic[templist[0].split('-')[0]])
                else:
                    templist.append('NA')

            finalresult.append(templist)

    with open(mousepeptrackfilename, 'wb') as pf:
        pwriter = csv.writer(pf, delimiter='\t')
        pwriter.writerows(finalresult)

    disGenDataDicName = disGenData()
    #disGenDataDicName='disGen.obj'
    disGenDataDic = cPickle.load(open(disGenDataDicName, 'rb'))
    unqhumanUniprotID = list(set(humanUniprotID))
    humanUniprotfuncinfodic = {}
    countProt = 0
    for subcode in unqhumanUniprotID:
        time.sleep(2)
        drugbanklist = []
        PN = 'NA'
        GN = 'NA'
        OG = 'NA'
        OGID = 'NA'
        dislist = []
        unidislist = []
        unidisURLlist = []
        disgendislist = []
        disgendisURLlist = []
        GoIDList = []
        GoNamList = []
        GoTermList = []
        GOinfo = []
        try:
            countProt += 1
            if countProt % 1000 == 0:
                print str(
                    countProt
                ), "th protein Protein Name, Gene, Organism Name,drug bank data,disease data job starts", str(
                    datetime.datetime.now())

            SGrequestURL = "https://www.uniprot.org/uniprot/" + str(
                subcode) + ".xml"
            SGunifile = urllib.urlopen(SGrequestURL)
            SGunidata = SGunifile.read()
            SGunifile.close()

            try:
                SGunidata = minidom.parseString(SGunidata)
                try:
                    drugdata = (SGunidata.getElementsByTagName('dbReference'))
                    for duItem in drugdata:
                        if (duItem.attributes['type'].value
                            ).upper() == 'DRUGBANK':
                            try:
                                drugname = (str(
                                    duItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip())
                                drugid = str(
                                    duItem.attributes['id'].value).strip()
                                durl = '<a target="_blank" href="https://www.drugbank.ca/drugs/' + drugid + '">' + drugname + '</a>'
                                drugbanklist.append(durl)
                            except:
                                pass
                        if (duItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    duItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass

                try:
                    godata = (SGunidata.getElementsByTagName('dbReference'))
                    for gItem in godata:
                        if (gItem.attributes['type'].value).upper() == 'GO':
                            try:
                                gonamedetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[1]
                                gotermdetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[0]
                                GoNamList.append(gonamedetails)
                                goid = str(
                                    gItem.attributes['id'].value).strip()
                                GoIDList.append(goid)
                                tempGoTerm = None

                                if gotermdetails.lower() == 'p':
                                    tempGoTerm = 'Biological Process'
                                if gotermdetails.lower() == 'f':
                                    tempGoTerm = 'Molecular Function'
                                if gotermdetails.lower() == 'c':
                                    tempGoTerm = 'Cellular Component'
                                GoTermList.append(tempGoTerm)
                                tempGOData = gonamedetails + ';' + goid + ';' + tempGoTerm
                                GOinfo.append(tempGOData)
                            except:
                                pass

                        if (gItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    gItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass

                try:
                    try:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('recommendedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue

                    except:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('submittedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue

                except IndexError:
                    pass

                try:
                    try:
                        GN = ((
                            SGunidata.getElementsByTagName('gene')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        GN = 'NA'
                except IndexError:
                    pass

                try:
                    try:
                        OG = ((
                            SGunidata.getElementsByTagName('organism')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        OG = 'NA'
                except IndexError:
                    pass

                try:
                    disdata = SGunidata.getElementsByTagName('disease')
                    for dItem in disdata:
                        disname = ''
                        disshort = ''
                        disURL = ''
                        disID = ''
                        try:
                            disname = (dItem.getElementsByTagName('name')[0]
                                       ).firstChild.nodeValue
                            disID = (dItem.attributes['id'].value).upper()
                        except:
                            pass
                        try:
                            disshort = (dItem.getElementsByTagName('acronym')
                                        [0]).firstChild.nodeValue
                        except:
                            pass
                        if len(disname.strip()) > 0:
                            disURL = '<a target="_blank" href="https://www.uniprot.org/diseases/' + disID + '">' + str(
                                disname.strip()) + '(' + str(
                                    disshort) + ')' + '</a>'
                            dislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidisURLlist.append(disURL)
                except IndexError:
                    pass

            except ExpatError:
                pass
        except IOError:
            pass
        drugbankdata = 'NA'
        disdata = 'NA'
        uniDisData = 'NA'
        uniDisURLData = 'NA'
        disgenDisData = 'NA'
        disgenDisURLData = 'NA'
        goiddata = 'NA'
        gonamedata = 'NA'
        gotermdata = 'NA'
        goData = 'NA'
        if GN != 'NA' and GN in disGenDataDic:
            disgendislist = disGenDataDic[GN][0]
            disgendisURLlist = disGenDataDic[GN][1]
            if len(dislist) > 0:
                dislist = dislist + disGenDataDic[GN][0]
            else:
                dislist = disGenDataDic[GN][0]

        if len(GoIDList) > 0:
            goiddata = '|'.join(list(set(GoIDList)))
        if len(GoNamList) > 0:
            gonamedata = '|'.join(list(set(GoNamList)))
        if len(GoTermList) > 0:
            gotermdata = '|'.join(list(set(GoTermList)))
        if len(GOinfo) > 0:
            goData = '|'.join(list(set(GOinfo)))
        if len(drugbanklist) > 0:
            drugbankdata = '|'.join(list(set(drugbanklist)))
        if len(dislist) > 0:
            disdata = '|'.join(list(set(dislist)))
        if len(unidislist) > 0:
            uniDisData = '|'.join(list(set(unidislist)))
        if len(unidisURLlist) > 0:
            uniDisURLData = '|'.join(list(set(unidisURLlist)))
        if len(disgendislist) > 0:
            disgenDisData = '|'.join(list(set(disgendislist)))
        if len(disgendisURLlist) > 0:
            disgenDisURLData = '|'.join(list(set(disgendisURLlist)))
        humanUniprotfuncinfodic[subcode] = [
            PN, GN, OG, OGID, disdata, uniDisData, uniDisURLData,
            disgenDisData, disgenDisURLData, drugbankdata, goiddata,
            gonamedata, gotermdata, goData
        ]
    hudicfile = 'humanUniprotfuncinfodic.obj'
    hudicf = open(hudicfile, 'wb')
    pickle.dump(humanUniprotfuncinfodic, hudicf, pickle.HIGHEST_PROTOCOL)
    hudicf.close()

    print("Extracting KEGG pathway name, job starts",
          str(datetime.datetime.now()))
    hkeggdictfile = {}
    huniproturl = 'https://www.uniprot.org/uploadlists/'
    hk = KEGG()
    for hkx in range(0, len(unqhumanUniprotID), 2000):
        countProt += hkx + 2000
        if countProt % 2000 == 0:
            print(str(countProt), "th protein kegg job starts",
                  str(datetime.datetime.now()))

        huniprotcodes = ' '.join(unqhumanUniprotID[hkx:hkx + 2000])
        huniprotparams = {
            'from': 'ACC',
            'to': 'KEGG_ID',
            'format': 'tab',
            'query': huniprotcodes
        }

        while True:
            try:
                hkuniprotdata = urllib.urlencode(huniprotparams)
                hkuniprotrequest = urllib2.Request(huniproturl, hkuniprotdata)
                hkuniprotresponse = urllib2.urlopen(hkuniprotrequest)
                for hkuniprotline in hkuniprotresponse:
                    hkudata = hkuniprotline.strip()
                    if not hkudata.startswith("From"):
                        hkuinfo = hkudata.split("\t")
                        if len(hkuinfo[1].strip()):
                            hkegg = hk.get(hkuinfo[1].strip())
                            hkudict_data = hk.parse(hkegg)
                            try:
                                try:
                                    if len(str(hkuinfo[0]).strip()) > 5:
                                        tempkeggData = '|'.join(
                                            '{};{}'.format(key, value)
                                            for key, value in
                                            hkudict_data['PATHWAY'].items())
                                        hkeggdictfile[hkuinfo[0].strip()] = [
                                            hkudict_data['PATHWAY'].values(),
                                            tempkeggData
                                        ]
                                except TypeError:
                                    pass
                            except KeyError:
                                pass
                break
            except urllib2.HTTPError:
                time.sleep(RETRY_TIME)
                print(
                    'Hey, I am trying again until succeeds to get data from KEGG!',
                    str(datetime.datetime.now()))
                pass

    hkdicfile = 'humankeggdic.obj'
    hkdicf = open(hkdicfile, 'wb')
    pickle.dump(hkeggdictfile, hkdicf, pickle.HIGHEST_PROTOCOL)
    hkdicf.close()
示例#19
0
def get_kegg_info(stId):
    """Fetch the KEGG entry for ``stId`` and return its parsed form.

    A new ``KEGG`` client is created on every call. The raw result of
    ``get`` is handed directly to ``parse``, and whatever ``parse``
    produces is returned to the caller unchanged.
    """
    client = KEGG()
    return client.parse(client.get(stId))