def updateSMILES(self, name_table="chemicals"):
    """Re-derive the cleaned SMILES and InChIKey of rows whose origin SMILES changed.

    The DSSTox SMILES file (``self.p_chem_SMILES``) is compared with the
    ``smiles_origin`` column of ``name_table``. When the SMILES differs, the
    structure is prepared again with ``CompDesc`` and the row is updated, but
    only if the recomputed InChIKey differs from the stored one.

    Args:
        name_table: name of the chemical table to update.
    """

    def _sql_literal(value):
        # Missing values must become SQL NULL, not the string 'None';
        # embedded single quotes are doubled as elsewhere in this file.
        if value is None:
            return "NULL"
        return "'%s'" % str(value).replace("'", "''")

    d_dsstox_SMILES = toolbox.loadMatrixToDict(self.p_chem_SMILES, sep=",")
    self.pr_desc = pathFolder.createFolder(self.pr_out + "DESC/")

    # index the rows of the table by DSSTox substance id
    cmd_SQL = "SELECT id, dsstox_id, smiles_origin, inchikey, smiles_clean FROM %s " % (
        name_table)
    d_chem_DB = {}
    for chem_DB in self.cDB.execCMD(cmd_SQL):
        d_chem_DB[chem_DB[1]] = [
            chem_DB[0], chem_DB[2], chem_DB[3], chem_DB[4]
        ]

    i = 0
    for chem in d_dsstox_SMILES:
        dsstox_id = d_dsstox_SMILES[chem]["dsstox_substance_id"]
        smiles = d_dsstox_SMILES[chem]["Original_SMILES"]

        # case of chemical is not in the DB
        if dsstox_id not in d_chem_DB:
            continue
        id_db, smiles_db, inchikey_db = d_chem_DB[dsstox_id][:3]
        if smiles == smiles_db:
            continue

        # recompute cleaned SMILES
        c_chem = CompDesc.CompDesc(smiles, self.pr_desc)
        c_chem.prepChem()
        if c_chem.err == 0:
            c_chem.generateInchiKey()
        else:
            c_chem.smi = None
        # err is checked again: generateInchiKey may itself have failed
        inchikey = c_chem.inchikey if c_chem.err == 0 else None

        # only rows whose InChIKey actually changed are rewritten
        if inchikey_db == inchikey:
            continue

        cmd_sql = "UPDATE %s SET smiles_origin = %s, smiles_clean = %s, inchikey=%s WHERE id='%s';" % (
            name_table, _sql_literal(smiles), _sql_literal(c_chem.smi),
            _sql_literal(inchikey), id_db)
        print(i)
        i = i + 1
        self.cDB.updateTable(cmd_sql)
    return
def pushNeighbors(self):
    """Insert the precomputed neighbor tables into ``<nameMap>_neighbors``.

    Nothing is done unless both neighbor tables (3 dimensions and N
    dimensions) exist in the ``Neighbors/`` folder of ``self.prout``.
    """
    dir_neighbors = pathFolder.createFolder(self.prout + "Neighbors/")
    p_table_3d = dir_neighbors + "Table_DIM1D2D-2_1.csv"
    p_table_nd = dir_neighbors + "Table_DIM1D2D-170_207.csv"

    if not (path.exists(p_table_3d) and path.exists(p_table_nd)):
        return

    def _load_neighbors(p_table):
        # one list of neighbor ids per chemical
        table = toolbox.loadMatrixToDict(p_table)
        return {
            key: row["Neighbors"].split(" ")
            for key, row in table.items()
        }

    def _as_array(l_values):
        # array literal: {"a","b",...}
        return "{" + ",".join(["\"%s\"" % (val) for val in l_values]) + "}"

    neighbors_3d = _load_neighbors(p_table_3d)
    neighbors_nd = _load_neighbors(p_table_nd)

    db = DBrequest.DBrequest()
    db.verbose = 0
    table_name = "%s_neighbors" % (self.nameMap)
    for inchikey in neighbors_3d:
        db.addElement(
            table_name,
            ["inchikey", "neighbors_dim3", "neighbors_dimn"],
            [
                inchikey,
                _as_array(neighbors_3d[inchikey]),
                _as_array(neighbors_nd[inchikey])
            ])
def updateMissingDTXSID(self, name_table):
    """Check if we can populate chemicals with no DTXSID with new update.

    Rows of ``name_table`` whose ``dsstox_id`` is null are matched, by their
    origin SMILES, against the DSSTox SMILES file; matching rows receive the
    CAS number, preferred name and DSSTox id.

    Args:
        name_table: name of the chemical table to update.
    """
    d_dsstox_SMILES = toolbox.loadMatrixToDict(self.p_chem_SMILES, sep=",")
    d_dsstox_name = toolbox.loadMatrixToDict(self.p_chem_name)

    # extract chemical without DTXSID, indexed by origin SMILES
    cmd_SQL = "SELECT id, smiles_origin FROM %s WHERE dsstox_id is null" % (
        name_table)
    d_chem_DB = {}
    for chem_DB in self.cDB.execCMD(cmd_SQL):
        print(chem_DB)
        d_chem_DB[chem_DB[1]] = chem_DB[0]

    for chem in d_dsstox_SMILES:
        row = d_dsstox_SMILES[chem]
        smiles = row["Original_SMILES"]
        # Only a failed lookup means "nothing to update for this chemical";
        # the DB call is kept outside the try so that its errors surface.
        try:
            id_chem = d_chem_DB[smiles]
            casrn = row["casrn"]
            dsstox_id = row["dsstox_substance_id"]
            name = d_dsstox_name[dsstox_id]["preferred_name"]
        except KeyError:
            continue

        # update chemical
        cmd_sql = "UPDATE %s SET casn = '%s', name = '%s', dsstox_id='%s' WHERE id='%s';" % (
            name_table, casrn, name.replace("'", "''"), dsstox_id, id_chem)
        print(cmd_sql)
        self.cDB.updateTable(cmd_sql)
def extractOnlyNewChem(self, name_table, field_comparison):
    """Build ``self.l_chem_toadd``: chemicals of the name file absent from the DB.

    The list is cached in ``updateDSSTOX/chem_list.txt`` (one id per line)
    and read back from there when the file already exists.

    Args:
        name_table: table holding the chemicals already loaded.
        field_comparison: column of ``name_table`` compared with the keys of
            the name file.
    """
    pr_out = pathFolder.createFolder(self.pr_out + "updateDSSTOX/")
    p_filout = pr_out + "chem_list.txt"

    if path.exists(p_filout):
        with open(p_filout, "r") as filin:
            # the file ends with a newline: drop the empty trailing entry
            self.l_chem_toadd = [
                chem for chem in filin.read().split("\n") if chem != ""
            ]
        return

    d_dsstox_name = toolbox.loadMatrixToDict(self.p_chem_name)

    # remove the chemicals already in the DB
    l_chem_DB = self.cDB.execCMD("SELECT %s FROM %s" %
                                 (field_comparison, name_table))
    for chem_DB in l_chem_DB:
        chem = chem_DB[0]
        if chem is None:
            continue
        d_dsstox_name.pop(chem, None)

    l_chem_toadd = list(d_dsstox_name.keys())

    # The cache is written only once the list is complete, so a failure
    # above cannot leave an empty cache file behind.
    with open(p_filout, "w") as filout:
        filout.writelines(chem + "\n" for chem in l_chem_toadd)

    self.l_chem_toadd = l_chem_toadd
def plotDensity(self, paff, ltypeaff, nbframeselect, prMDdesc, prout):
    """Write the master matrix of per-frame ligand descriptors.

    Compounds of the affinity table whose "Type" is in ``ltypeaff`` are
    kept; for each one, frames of ``<prMDdesc><ID>/Ligbyframe`` are sampled
    every ``nbframe / nbframeselect`` frames and written as one row each.
    Nothing is done when the master matrix file already exists.

    Args:
        paff: path of the affinity table (loaded when not already cached).
        ltypeaff: affinity types to keep.
        nbframeselect: number of frames to sample per compound.
        prMDdesc: folder holding one descriptor folder per compound.
        prout: output folder (used as a path prefix).
    """
    # create matrix with conformer
    if "paff" not in self.__dict__:
        self.paff = paff
        self.daff = toolbox.loadMatrixToDict(paff)
    elif "daff" not in self.__dict__:
        self.daff = toolbox.loadMatrixToDict(self.paff)

    dID = {}
    for IDChEMBL in self.daff:
        if self.daff[IDChEMBL]["Type"] in ltypeaff:
            dID[IDChEMBL] = deepcopy(self.daff[IDChEMBL])

    pMasterMatrix = prout + "masterM_" + "-".join(ltypeaff) + "_" + str(
        nbframeselect)
    if path.exists(pMasterMatrix):
        return

    # header, computed before the file is created so that a failure here
    # does not leave an empty master matrix blocking the next run
    lID = list(dID.keys())
    pdescbyFrameh = prMDdesc + lID[0] + "/Ligbyframe"
    dh = toolbox.loadMatrixToDict(pdescbyFrameh)
    lh = list(dh[list(dh.keys())[0]].keys())
    lh.remove("Frame")

    with open(pMasterMatrix, "w") as fmasterMatrix:
        fmasterMatrix.write("ID\t" + "\t".join(lh) + "\n")

        # create masterMatrix
        for ichem, IDChEMBL in enumerate(lID):
            print("%s %s" % (IDChEMBL, ichem))
            pdescbyFrame = prMDdesc + IDChEMBL + "/Ligbyframe"
            dframe = toolbox.loadMatrixToDict(pdescbyFrame)
            nbframe = len(dframe)
            # only complete trajectories (1501 frames) are used
            if nbframe != 1501:
                print("%s %s" % (IDChEMBL, "Error"))
                continue

            # at least 1, otherwise the frame loop would never advance
            step = max(1, int(nbframe // nbframeselect))
            for iframe in range(0, nbframe, step):
                print(iframe)
                idw = str(IDChEMBL) + "_" + str(iframe)
                stframe = "LGD_" + str(iframe).zfill(5)
                print("%s %s" % (stframe, idw))
                fmasterMatrix.write(
                    "%s\t%s\n" %
                    (idw, "\t".join([str(dframe[stframe][h]) for h in lh])))
def computeCoords(self, corVal, distributionVal, insertDB=1):
    """Compute (or reuse) the map coordinate files and optionally push them in the DB.

    The files ``coord1D2D.csv`` and ``coord3D.csv`` are looked for in the
    ``map_<corVal>-<distributionVal>/`` folder and generated with the R
    script when one of them is missing. Their paths are stored in
    ``self.pcoords1D2D`` and ``self.pcoords3D``.

    Args:
        corVal: correlation cutoff forwarded to the R script.
        distributionVal: distribution cutoff forwarded to the R script.
        insertDB: when 1, coordinates are inserted in ``drugbank_coords``.
    """
    # NOTE(review): descriptors are recomputed only when BOTH paths are
    # missing; an "or" may have been intended - confirm.
    if not "p1D2D" in self.__dict__ and not "p3D" in self.__dict__:
        self.computeDesc(insertDB=0)

    err = 0
    # create coords
    prmap = pathFolder.createFolder(self.prout + "map_" + str(corVal) + "-" +
                                    str(distributionVal) + "/")
    pcoordDim1Dim2 = prmap + "coord1D2D.csv"
    pcoordDim3D = prmap + "coord3D.csv"

    if not (path.exists(pcoordDim1Dim2) and path.exists(pcoordDim3D)):
        runExternalSoft.RComputeMapFiles(self.p1D2D, self.p3D, prmap, corVal,
                                         distributionVal)
        # check again: the generation may have failed
        if not (path.exists(pcoordDim1Dim2) and path.exists(pcoordDim3D)):
            print("ERROR file map")
            err = 1

    self.pcoords1D2D = pcoordDim1Dim2
    self.pcoords3D = pcoordDim3D

    if insertDB == 1 and err == 0:
        dcoord1D2D = toolbox.loadMatrixToDict(pcoordDim1Dim2, sep=",")
        dcoord3D = toolbox.loadMatrixToDict(pcoordDim3D, sep=",")
        cDB = DBrequest.DBrequest()
        cDB.verbose = 0
        for chem in dcoord1D2D:
            # every column except one is a dimension
            nbdim1d2d = len(dcoord1D2D[chem].keys()) - 1
            nbdim3d = len(dcoord3D[chem].keys()) - 1

            w1D2D = "{" + ",".join([
                "\"%s\"" % (dcoord1D2D[chem]["DIM" + str(i)])
                for i in range(1, nbdim1d2d + 1)
            ]) + "}"
            w3D = "{" + ",".join([
                "\"%s\"" % (dcoord3D[chem]["DIM3-" + str(i)])
                for i in range(1, nbdim3d + 1)
            ]) + "}"
            cDB.addElement(
                "drugbank_coords",
                ["inchikey", "dim1d2d", "dim3d", "indrugbank"],
                [chem, w1D2D, w3D, "True"])
def pushTablePropAllInDB(self, istart=0):
    """Insert in ``dsstox_prop`` the chemicals of the property table not yet stored.

    The table is read from ``self.pTableInAll``; a chemical is inserted only
    when no row with its ``db_id`` exists, so the function can be re-run.

    Args:
        istart: index of the first chemical to process (to resume a
            previous run); defaults to the beginning of the table.
    """
    if "pTableInAll" not in self.__dict__:
        print("GENERATE TABLE FIRST")
        return

    dtopush = toolbox.loadMatrixToDict(self.pTableInAll)
    cDB = DBrequest.DBrequest()
    cDB.verbose = 0

    lchem = list(dtopush.keys())
    for i in range(istart, len(lchem)):
        chem = lchem[i]
        outChem = cDB.getRow("dsstox_prop", "db_id='%s'" % (chem))
        if outChem == []:
            # single quotes are stripped from the values
            wprop = "{" + ",".join([
                "\"%s\"" % (dtopush[chem][PROP].replace("'", ""))
                for PROP in LPROP
            ]) + "}"
            cDB.addElement("dsstox_prop", ["db_id", "prop_value"],
                           [chem, wprop])
    return
def importDescriptors(self, prDesc = "/home/borrela2/interference/Desc/DESCbyCAS/"):
    """Merge the per-CAS descriptor files of the dyes into ``<prout>descMat``.

    For each entry of ``self.dDye`` with a descriptor file
    ``<prDesc><casrn>.txt``, the descriptors are collected and the colour
    is stored in ``self.dcolor``. The matrix is written as CSV with an
    ``Aff`` column set to 1; missing descriptors are written as "NA".

    Args:
        prDesc: folder holding one descriptor file per CAS number.

    Returns:
        Path of the descriptor matrix.

    Raises:
        IndexError: when no descriptor file was found.
    """
    ddesc = {}
    dcolor = {}
    CASID = None
    for chemID in self.dDye:
        chem = self.dDye[chemID]
        CASID = chem["casrn"]
        pdescin = prDesc + CASID + ".txt"
        if path.exists(pdescin):
            ddesc.update(toolbox.loadMatrixToDict(pdescin))
            dcolor[CASID] = chem["color"]
    self.dcolor = dcolor

    pdesc = self.prout + "descMat"

    # header from the first chemical; computed before the file is opened so
    # that a failure does not leave an empty matrix on disk
    ldesc = list(ddesc[list(ddesc.keys())[0]].keys())
    print(ldesc)
    print(CASID)
    ldesc.remove("CAS")

    with open(pdesc, "w") as fildesc:
        fildesc.write("ID," + ",".join(ldesc) + ",Aff\n")
        for CASID in ddesc:
            lw = [str(ddesc[CASID].get(desc, "NA")) for desc in ldesc]
            fildesc.write("%s,%s,1\n" % (CASID, ",".join(lw)))
    return pdesc
def histRMSD(self, paff, prMDanalysis, prout):
    """Write the ligand/protein RMSD table of the compounds and plot it.

    For each compound of the affinity table with a
    ``<prMDanalysis><ID>_2hyy_MD/RMSDs/`` folder, one row is written to
    ``<prout>RMSDprotlig``; the table is then passed to
    ``runExternalSoft.histRMSD``.

    Args:
        paff: path of the affinity table (loaded when not already cached).
        prMDanalysis: folder holding the MD analysis of each compound.
        prout: output folder (used as a path prefix).
    """
    # The affinity table may already be cached on the instance; in every
    # case work from self.daff so that daff is always defined.
    if "paff" not in self.__dict__:
        self.paff = paff
        self.daff = toolbox.loadMatrixToDict(paff)
    elif "daff" not in self.__dict__:
        self.daff = toolbox.loadMatrixToDict(self.paff)
    daff = self.daff

    #lig
    pRMSD = prout + "RMSDprotlig"
    with open(pRMSD, "w") as fRMSD:
        fRMSD.write("CHEMBLid\tRMSDlig\tRMSDca\tRMSDall\tTypeAff\n")
        for CHEMBLid in daff:
            pRMSDin = prMDanalysis + CHEMBLid + "_2hyy_MD/RMSDs/"
            if not path.exists(pRMSDin):
                continue
            RMSDChem = RMSD.RMSD(pRMSDin)
            RMSDChem.loadRMSDs(["ligand", "protein"])
            RMSDprot = RMSDChem.MRMSDprot()
            RMSDlig = RMSDChem.MRMSDlig()
            fRMSD.write("%s\t%f\t%f\t%f\t%s\n" %
                        (CHEMBLid, RMSDlig, RMSDprot[0], RMSDprot[1],
                         daff[CHEMBLid]["Type"]))

    runExternalSoft.histRMSD(pRMSD, prout)
def clusterizeTopActive(self, top):
    """Cluster the ``top`` most active chemicals.

    The affinity table restricted to the best affinities is written in
    ``top<top>/Aff<top>.csv`` (the whole table is copied when it holds at
    most ``top`` chemicals), then given to ``runExternalSoft.clusterize``.
    Chemicals tied with the last selected affinity are all kept.

    Args:
        top: number of most active chemicals to select.
    """
    dir_top = pathFolder.createFolder(self.prout + "top" + str(top) + "/")
    p_aff_top = dir_top + "Aff" + str(top) + ".csv"

    if not path.exists(p_aff_top):
        # create top descriptor
        d_aff = toolbox.loadMatrixToDict(self.paff)
        if len(d_aff) <= top:
            copyfile(self.paff, p_aff_top)
        else:
            ranked = sorted((float(row["Aff"]) for row in d_aff.values()),
                            reverse=True)
            cutoff_aff = ranked[top - 1]
            selected = [
                chem_id for chem_id in d_aff
                if float(d_aff[chem_id]["Aff"]) >= cutoff_aff
            ]

            out = open(p_aff_top, "w")
            out.write("CHEMBLID\tAff\tType\n")
            for chem_id in selected:
                row = d_aff[chem_id]
                out.write("%s\t%s\t%s\n" %
                          (row["CHEMBLID"], row["Aff"], row["Type"]))
            out.close()

    runExternalSoft.clusterize(self.pdesc, p_aff_top, self.typeAff,
                               self.cutoff, dir_top)
def formatChemForToolChem(self, split_nbChem):
    """Write the DSSTox chemicals, in random order, as files of ``split_nbChem`` rows.

    Files are named ``ForToolchem/chemlist_<n>.csv`` (tab separated, with a
    header). Nothing is written when the folder already contains files.

    Args:
        split_nbChem: number of chemicals per output file.
    """
    pr_out = pathFolder.createFolder(self.pr_out + "ForToolchem/")
    self.pr_forToolChem = pr_out

    # case files are already computed
    if len(listdir(pr_out)) > 0:
        return

    d_dsstox_name = toolbox.loadMatrixToDict(self.p_chem_name)
    d_dsstox_SMILES = toolbox.loadMatrixToDict(self.p_chem_SMILES, sep=",")

    l_dsstoxid = list(d_dsstox_name.keys())
    # random.shuffle works in place and returns None: the result must not
    # replace the list. A shuffle that returns a new list is still honoured.
    shuffled = shuffle(l_dsstoxid)
    if shuffled is not None:
        l_dsstoxid = shuffled

    f_out = None
    i_file = 0  # rows written in the current file
    try:
        for i, dsstox_id in enumerate(l_dsstoxid):
            if i_file == split_nbChem or i == 0:
                i_file = 0
                if i == 0:
                    f_out = open("%schemlist_1.csv" % (pr_out), "w")
                else:
                    # integer division: the file index must be an int
                    print(i // split_nbChem + 1)
                    f_out.close()
                    f_out = open(
                        "%schemlist_%i.csv" %
                        (pr_out, i // split_nbChem + 1), "w")
                f_out.write(
                    "smiles_origin\tdsstox_id\tdrugbank_id\tname\tcasn\n")

            name = d_dsstox_name[dsstox_id]["preferred_name"]
            casrn = d_dsstox_name[dsstox_id]["casrn"]
            try:
                smiles_origin = d_dsstox_SMILES[dsstox_id]["Original_SMILES"]
            except KeyError:
                # chemical without structure
                smiles_origin = ""
            f_out.write("%s\t%s\tNA\t%s\t%s\n" %
                        (smiles_origin, dsstox_id, name, casrn))
            i_file = i_file + 1
    finally:
        # f_out stays None when there is no chemical at all
        if f_out is not None:
            f_out.close()
def prepChemForWebsite(PTOX21CHEMSUM, PTOX21CHEM, prout, indb=0):
    """Write the ``ChemSum`` table (CAS, DTXID, name, SMILES) of the Tox21 chemicals.

    Chemicals of the summary file are indexed by CAS number; the SMILES is
    taken from the chemical file through the DSSTox id. Chemicals without
    SMILES are kept in the returned dictionary (SMILES set to 0) but are
    neither written nor inserted.

    Args:
        PTOX21CHEMSUM: comma separated summary file.
        PTOX21CHEM: tab separated chemical file with ``smiles_origin``.
        prout: output folder (used as a path prefix).
        indb: when 1, written chemicals are also added to
            ``bodymap_chemicals``.

    Returns:
        Dictionary CAS -> {"DTXID", "name", "SMILES"}.
    """
    d_sum = toolbox.loadMatrixToDict(PTOX21CHEMSUM, sep=",")
    d_chem = toolbox.loadMatrixToDict(PTOX21CHEM, sep='\t')

    db = DBrequest.DBrequest()

    dout = {}
    for key in d_sum:
        row = d_sum[key]
        cas = row["casn"]
        dtxid = row["dsstox_substance_id"]
        name = row["chnm"]
        # 0 flags a chemical without SMILES; any lookup failure counts
        try:
            smiles = d_chem[dtxid]["smiles_origin"]
        except:
            smiles = 0
        dout[cas] = {"DTXID": dtxid, "name": name, "SMILES": smiles}

    filout = open(prout + "ChemSum", "w")
    filout.write("CAS\tDTXID\tName\tSMILES\n")
    for cas in dout:
        info = dout[cas]
        if info["SMILES"] == 0:
            continue
        l_val = [cas, info["DTXID"], info["name"], info["SMILES"]]
        filout.write("%s\t%s\t%s\t%s\n" % tuple(l_val))
        if indb == 1:
            db.addElement("bodymap_chemicals",
                          ["casn", "dsstox_id", "name", "smiles"], l_val)
    filout.close()

    return dout
def generateCentroidFile(self):
    """Write ``centroids.csv``: one centroid per slice of each split map.

    Each split-map file of ``self.psplitMap`` assigns chemicals to numbered
    slices. Slices are processed from 1 upward until a number has no
    chemical; the centroid is computed from the two first 1D2D coordinates
    and the first 3D coordinate of the chemicals of the slice.
    """
    if not "prmaps" in self.__dict__:
        print("Generate Maps first")
        return
    if not "psplitMap" in self.__dict__:
        print("Generate the split map first")
        return
    lpfmap = list(self.psplitMap.values())

    pfilout = self.prmaps + "centroids.csv"

    coords1D2D = toolbox.loadMatrixCoords(self.pcoords1D2D, 2)
    coords3D = toolbox.loadMatrixCoords(self.pcoords3D, 2)

    dout = {}
    for pmap in lpfmap:
        print(pmap)
        nameMap = pmap.split("/")[-1].split("_")[0]
        dmap = toolbox.loadMatrixToDict(pmap)
        print(nameMap)

        # group the chemicals by slice in a single pass instead of
        # rescanning the whole map for every slice number
        dslice = {}
        for chem in dmap:
            dslice.setdefault(int(dmap[chem]["map"]), []).append(chem)

        i = 1
        while i in dslice:
            lcoords = [[
                coords1D2D[chem][0], coords1D2D[chem][1], coords3D[chem][0]
            ] for chem in dslice[i]]
            print(len(lcoords))
            print(lcoords[0])
            dout[nameMap + "_" + str(i)] = calculate.centroid(lcoords)
            i = i + 1

    with open(pfilout, "w") as filout:
        filout.write("map\tx\ty\tz\n")
        for mapname in dout:
            filout.write("%s\t%s\t%s\t%s\n" %
                         (mapname, dout[mapname][0], dout[mapname][1],
                          dout[mapname][2]))
def loadOPERADesc(pOPERA, table):
    """Register the OPERA descriptor names of a result file in ``table``.

    Column names are read from the first row of the file; every column but
    the first is inserted with an id starting at 1.

    Args:
        pOPERA: comma separated OPERA result file.
        table: name of the table receiving the (id, name) rows.
    """
    d_opera = toolbox.loadMatrixToDict(pOPERA, sep = ",")
    first_key = list(d_opera.keys())[0]
    l_names = list(d_opera[first_key].keys())

    db = DBrequest.DBrequest()
    db.verbose = 1

    for id_desc, name_desc in enumerate(l_names[1:], 1):
        db.addElement(table, ["id", "name"], [id_desc, name_desc])

#loadOPERADesc("/home/borrela2/sandbox/VM/ChemMap2Site/temp/949289/OPERA/ZWRUINPWMLAQRD-UHFFFAOYSA-N.csv", "desc_opera_name")
def get_colnames(self):
    """Return the lower-cased column names of the first file of ``self.pr_tripodFiles``.

    The names are read from the first row of the file and also stored in
    ``self.l_col``.
    """
    first_file = listdir(self.pr_tripodFiles)[0]
    print(first_file)

    table = toolbox.loadMatrixToDict(self.pr_tripodFiles + "/" + first_file)
    first_row = table[list(table.keys())[0]]

    self.l_col = []
    for colname in list(first_row.keys()):
        self.l_col.append(colname.lower())
    return self.l_col
def computeDescNewChem(self):
    """Compute and write the 2D and 3D descriptors of the chemicals to add.

    The list of new chemicals (``self.l_chem_toadd``) is extracted when it
    is not available yet, shuffled in place and processed one chemical at a
    time; failures are reported on the standard output and skipped.
    """
    if "l_chem_toadd" not in self.__dict__:
        self.extractOnlyNewChem("chemicals", "dsstox_id")

    self.pr_desc = pathFolder.createFolder(self.pr_out + "DESC/")
    d_smiles = toolbox.loadMatrixToDict(self.p_chem_SMILES, sep=",")

    # same list object as self.l_chem_toadd: the shuffle is visible there
    l_todo = self.l_chem_toadd
    shuffle(l_todo)

    print(len(self.l_chem_toadd))
    for idx, chem_id in enumerate(l_todo):
        if idx % 1000 == 0:
            print(idx)

        try:
            smiles = d_smiles[chem_id]["Original_SMILES"]
        except:
            print(idx, ": ERROR in SMILES - ", chem_id)
            continue

        c_desc = CompDesc.CompDesc(smiles, self.pr_desc)
        c_desc.prepChem()
        if c_desc.err != 0:
            print("Error prep: %s -- %s" % (chem_id, idx))
            continue

        c_desc.generateInchiKey()
        if c_desc.err == 1:
            print("Error inch: %s" % (chem_id))
            continue

        # 2D desc
        c_desc.computeAll2D()
        c_desc.writeMatrix("2D")

        #3D desc
        c_desc.set3DChemical()
        if c_desc.err != 0:
            print("Error 3D generation: %s -- %s" % (chem_id, idx))
            continue

        c_desc.computeAll3D()
        if c_desc.err == 1:
            print("Error 3D desc: %s -- %s" % (chem_id, idx))
        else:
            c_desc.writeMatrix("3D")
def updateNameAndCAS(self, name_table):
    """Align the name and CAS number of the chemical table with the DSSTox name file.

    Rows of ``name_table`` are matched on their DSSTox id; a row is updated
    when its name or CAS number differs from the name file.

    Args:
        name_table: name of the chemical table to update.
    """
    cmd_SQL = "SELECT id, dsstox_id, casn, name FROM %s " % (name_table)
    d_chem_DB = {}
    for chem_DB in self.cDB.execCMD(cmd_SQL):
        d_chem_DB[chem_DB[1]] = [chem_DB[0], chem_DB[2], chem_DB[3]]

    d_dsstox_name = toolbox.loadMatrixToDict(self.p_chem_name)

    for i, dsstox_id in enumerate(d_dsstox_name):
        if i % 50000 == 0:
            print(i)

        name = d_dsstox_name[dsstox_id]["preferred_name"]
        casrn = d_dsstox_name[dsstox_id]["casrn"]

        # chemical not in the DB
        if dsstox_id not in d_chem_DB:
            continue
        id_db, cas_db, name_db = d_chem_DB[dsstox_id]

        # Compare the raw values: comparing the SQL-escaped name would flag
        # every name containing a quote as changed on each run.
        if name_db != name or casrn != cas_db:
            cmd_sql = "UPDATE %s SET casn = '%s', name = '%s' WHERE id='%s';" % (
                name_table, casrn.replace("'", "''"),
                name.replace("'", "''"), id_db)
            self.cDB.updateTable(cmd_sql)
def loadRMSDs(self, lload=["ligand", "protein", "residues"]):
    """Load the requested RMSD tables from the sub-folders of ``self.prin``.

    Args:
        lload: kinds of table to load among "ligand" (stored in
            ``self.lig`` with keys "RMSF" and "ShaEP"), "protein"
            (``self.prot``) and "residues" (``self.res``); other values
            are ignored.
    """
    for kind in lload:
        if kind == "ligand":
            dir_in = self.prin + kind
            self.lig = {
                "RMSF": toolbox.loadMatrixToDict(dir_in + "/ligRMSF"),
                "ShaEP": toolbox.matrixToList(dir_in + "/ligShaEP"),
            }
        elif kind == "protein":
            self.prot = toolbox.matrixToList(self.prin + kind + "/protRMSD")
        elif kind == "residues":
            self.res = toolbox.loadMatrixToDict(self.prin + kind + "/resRMSD")
def pushDSSTOXNeighbors(self, prin):
    """Insert in ``dsstox_neighbors`` the neighbor files of the folder ``prin``.

    Each file holds the neighbors of one chemical (first key of the table).
    Files that cannot be parsed are deleted so that they can be recomputed.

    Args:
        prin: folder of the neighbor files (used as a path prefix).
    """
    cDB = DBrequest.DBrequest()
    cDB.verbose = 0

    for fileNeighbor in listdir(prin):
        pfile = prin + fileNeighbor
        # Exception only: an interruption (Ctrl-C) must not delete the file
        try:
            dneighbor = toolbox.loadMatrixToDict(pfile)
        except Exception:
            remove(pfile)
            continue

        # table without any row: nothing to insert
        if not dneighbor:
            continue

        inchkey = list(dneighbor.keys())[0]
        lneighbor = dneighbor[inchkey]["Neighbors"].split(" ")
        dneighbor[inchkey]["Neighbors"] = lneighbor

        w3D = "{" + ",".join(
            ["\"%s\"" % (neighbor) for neighbor in lneighbor]) + "}"
        cDB.addElement("dsstox_neighbors", ["inchikey", "neighbors_dim3"],
                       [inchkey, w3D])
    return
def loadLC50(self):
    """Add the literature and consensus LD50 values of ``self.p_LD50`` to ``self.dout``.

    Entries are indexed by DTXSID. A chemical not yet in ``self.dout`` is
    created with the other toxicity fields set to "NA".
    """
    d_LC50 = toolbox.loadMatrixToDict(self.p_LD50)

    for chem in d_LC50:
        row = d_LC50[chem]
        dtxsid = row["DTXSID"]

        # direct dictionary membership instead of scanning a list of keys
        if dtxsid not in self.dout:
            self.dout[dtxsid] = {}
            for field in ("very_toxic", "nontoxic", "LD50_mgkg",
                          "EPA_category", "GHS_category"):
                self.dout[dtxsid][field] = "NA"

        for field in ("LD50_mgkg_Literature", "log(LD50_Literature)",
                      "consensus_LD50", "concordance_LD50"):
            self.dout[dtxsid][field] = row[field]
def importDescriptors(self, prDesc="/home/borrela2/interference/Desc/DESCbyCAS/"):
    """Merge the per-CAS descriptor files of the database into ``<prout>descMat``.

    The CAS number of each entry of ``self.DB`` is the second "_"-separated
    field of its "Structure". For entries with a descriptor file
    ``<prDesc><CAS>.txt``, the descriptors are collected and the wavelength
    is stored in ``self.dwave``. The matrix is written as CSV with an
    ``Aff`` column set to 1; missing descriptors are written as "NA".

    Args:
        prDesc: folder holding one descriptor file per CAS number.

    Returns:
        Path of the descriptor matrix.

    Raises:
        IndexError: when no descriptor file was found.
    """
    ddesc = {}
    dwave = {}
    CASID = None
    for chemID in self.DB:
        chem = self.DB[chemID]
        CASID = chem["Structure"].split("_")[1]
        Abs = chem["Wavelength"]
        pdescin = prDesc + CASID + ".txt"
        if path.exists(pdescin):
            ddesc.update(toolbox.loadMatrixToDict(pdescin))
            dwave[CASID] = Abs
    self.dwave = dwave

    pdesc = self.prout + "descMat"

    # header from the first chemical; computed before the file is opened so
    # that a failure does not leave an empty matrix on disk
    ldesc = list(ddesc[list(ddesc.keys())[0]].keys())
    print(ldesc)
    print(CASID)
    ldesc.remove("CAS")

    with open(pdesc, "w") as fildesc:
        fildesc.write("ID," + ",".join(ldesc) + ",Aff\n")
        for CASID in ddesc:
            lw = [str(ddesc[CASID].get(desc, "NA")) for desc in ldesc]
            fildesc.write("%s,%s,1\n" % (CASID, ",".join(lw)))
    return pdesc
def updateDescOPERA(self):
    """Fill ``desc_opera`` of ``chemical_description`` from precomputed OPERA files.

    Chemicals whose ``desc_opera`` is null are mapped, through their DSSTox
    substance id, to a DSSTox compound id; descriptor values are then read
    from the files of the folders ``self.l_prOPERA_pred``. A descriptor not
    found (or not numeric) is stored as "-9999".
    """
    # 1. load opera descriptor used for the DB
    cmd_SQL = "SELECT id, name FROM chem_descriptor_opera_name_new "
    d_opera_desc = {}
    for opera_desc in self.cDB.execCMD(cmd_SQL):
        d_opera_desc[int(opera_desc[0])] = opera_desc[1]

    # 2. load the map substance id -> compound id
    d_dsstox2dsctox = toolbox.loadMatrixToDict(self.p_dtxsid_dtxcid_map,
                                               sep="\t")

    # 3. load inch and dtx id from chemicals table
    cmd_SQL = "SELECT dsstox_id, inchikey FROM chemicals WHERE inchikey is not null"
    l_chemicalsDB = self.cDB.execCMD(cmd_SQL)

    # 3.2 load inch without desc opera
    cmd_SQL = "SELECT inchikey FROM chemical_description WHERE desc_opera is null"
    l_inch_toupdate = sorted(inch[0] for inch in self.cDB.execCMD(cmd_SQL))

    d_chemicalsDB = {}
    for chemicalsDB in l_chemicalsDB:
        inchikey = chemicalsDB[1]
        if toolbox.binary_search(l_inch_toupdate, inchikey) == -1:
            continue
        dsstoxid = chemicalsDB[0]
        if dsstoxid is None:
            continue
        d_chemicalsDB.setdefault(inchikey, []).append(dsstoxid)

    d_update = {}
    for inchikey in d_chemicalsDB:
        for dsstox_sid in d_chemicalsDB[inchikey]:
            try:
                dsstox_cid = d_dsstox2dsctox[dsstox_sid]["dsstox_compound_id"]
            except KeyError:
                continue
            d_update[dsstox_cid] = {
                "inchikey": inchikey,
                "opera": {
                    desc_name: "-9999"
                    for desc_name in d_opera_desc.values()
                },
            }
            # the first substance id with a compound id is enough
            break

    # 4. load OPERA descriptors
    for pr_OPERA_pred in self.l_prOPERA_pred:
        for p_fopera in listdir(pr_OPERA_pred):
            print(pr_OPERA_pred + p_fopera)
            d_temp = toolbox.loadMatrixToDict(pr_OPERA_pred + p_fopera,
                                              sep=",")
            for dtx_cid in d_update:
                # skip at once the chemicals absent from this file instead
                # of failing once per descriptor
                if dtx_cid not in d_temp:
                    continue
                d_row = d_temp[dtx_cid]
                d_opera = d_update[dtx_cid]["opera"]
                for desc in d_opera:
                    try:
                        d_opera[desc] = float(d_row[desc])
                    except (KeyError, ValueError, TypeError):
                        # descriptor missing or not numeric: keep the value
                        pass

    # 5. update DB
    l_dtx_cid = list(d_update.keys())
    shuffle(l_dtx_cid)
    # values are ordered by descriptor id, expected to run from 1 to N
    nb_desc = len(d_opera_desc)
    for j, dtx_cid in enumerate(l_dtx_cid):
        inchikey = d_update[dtx_cid]["inchikey"]
        l_toadd = [
            d_update[dtx_cid]["opera"][d_opera_desc[i]]
            for i in range(1, nb_desc + 1)
        ]

        wOPERA = "{" + ",".join(
            ["\"%s\"" % (str(desc_val)) for desc_val in l_toadd]) + "}"
        cmd_sql = "UPDATE chemical_description SET desc_opera = '%s' WHERE inchikey='%s';" % (
            wOPERA, inchikey)
        self.cDB.verbose = 0
        self.cDB.updateTable(cmd_sql)
        if j % 1000 == 0:
            print(j)
def generateNeighborMatrix(self, nbNeighbor, lnDim):
    """Compute the ``nbNeighbor`` closest chemicals of every chemical.

    For the "dsstox" map, one file per chemical is written in
    ``Neighbors/``; the search is limited to the chemicals lying in the
    same or an adjacent slice of one of the split maps, and the distance
    uses the two first 1D2D coordinates and the first 3D coordinate.
    For the other maps, a single table ``Table_DIM1D2D-<n>_<m>.csv`` is
    written from all pairwise distances on ``lnDim[0]`` 1D2D dimensions and
    ``lnDim[1]`` 3D dimensions.

    Args:
        nbNeighbor: number of neighbors kept per chemical.
        lnDim: [number of 1D2D dimensions, number of 3D dimensions]; an
            empty list selects every dimension of the coordinate files.

    Returns:
        1 when the coordinates are not available, None otherwise.
    """
    if not "pcoords1D2D" in self.__dict__:
        print("Compute Coord first")
        return 1

    prNeighbor = pathFolder.createFolder(self.prout + "Neighbors/")

    if self.nameMap == "dsstox":
        # no N dimension because too slow
        dDim1D2D = toolbox.loadMatrixCoords(self.pcoords1D2D, 2)
        dDim3D = toolbox.loadMatrixCoords(self.pcoords3D, 2)

        # slice number of each chemical, parsed once per split map
        lslice = []
        for imap in self.psplitMap:
            dmap = toolbox.loadMatrixToDict(self.psplitMap[imap])
            lslice.append({chem: int(dmap[chem]["map"]) for chem in dmap})

        # from 1D2D coord
        lchem = list(dDim1D2D.keys())
        shuffle(lchem)
        for inch in lchem:
            pfilout = prNeighbor + inch
            if path.exists(pfilout):
                continue

            # define map where we inspect: same or adjacent slice in any
            # split map; each chemical is kept once, in order of discovery
            linmap = []
            seen = set()
            for dslice in lslice:
                mapin = dslice[inch]
                for chem in dslice:
                    if abs(dslice[chem] - mapin) <= 1 and chem not in seen:
                        seen.add(chem)
                        linmap.append(chem)
            print(len(linmap))

            coord_inch = [
                dDim1D2D[inch][0], dDim1D2D[inch][1], dDim3D[inch][0]
            ]
            ddist = {}
            for ID in linmap:
                if ID != inch:
                    coord_ID = [
                        dDim1D2D[ID][0], dDim1D2D[ID][1], dDim3D[ID][0]
                    ]
                    ddist[ID] = sqrt(
                        sum([(xi - yi)**2
                             for xi, yi in zip(coord_ID, coord_inch)]))

            lID = [
                pair[0]
                for pair in sorted(ddist.items(), key=lambda x: x[1])
            ][:nbNeighbor]

            # The file is written only once the neighbors are known: its
            # existence marks the chemical as done.
            with open(pfilout, "w") as filout:
                filout.write("ID\tNeighbors\n")
                filout.write("%s\t%s\n" % (inch, " ".join(lID)))
        return

    # compute all dimension opening without restriction
    dDim1D2D = None
    dDim3D = None
    if lnDim == []:
        dDim1D2D = toolbox.loadMatrixToDict(self.pcoords1D2D, sep=",")
        dDim3D = toolbox.loadMatrixToDict(self.pcoords3D, sep=",")
        chem1 = list(dDim1D2D.keys())[0]
        # every column except one is a dimension
        n1D2D = len(list(dDim1D2D[chem1].keys())) - 1
        n3D = len(list(dDim3D[chem1].keys())) - 1
        lnDim = [n1D2D, n3D]

    pfilout = prNeighbor + "Table_DIM1D2D-" + str(lnDim[0]) + "_" + str(
        lnDim[1]) + ".csv"
    if path.exists(pfilout):
        return

    if dDim1D2D is None:
        dDim1D2D = toolbox.loadMatrixToDict(self.pcoords1D2D, sep=",")
        dDim3D = toolbox.loadMatrixToDict(self.pcoords3D, sep=",")

    # coordinates of each chemical: 1D2D dimensions then 3D dimensions
    dcor = {}
    for inch in dDim1D2D:
        dcor[inch] = [
            float(dDim1D2D[inch]["DIM" + str(i)])
            for i in range(1, lnDim[0] + 1)
        ] + [
            float(dDim3D[inch]["DIM3-" + str(i)])
            for i in range(1, lnDim[1] + 1)
        ]

    dneighbor = {}
    for ID in dcor:
        ddist = {}
        for ID2 in dcor:
            if ID != ID2:
                ddist[ID2] = sqrt(
                    sum([(xi - yi)**2
                         for xi, yi in zip(dcor[ID], dcor[ID2])]))
        dneighbor[ID] = [
            pair[0] for pair in sorted(ddist.items(), key=lambda x: x[1])
        ][:nbNeighbor]

    # write in table
    with open(pfilout, "w") as ftable:
        ftable.write("ID\tNeighbors\n")
        for ID in dneighbor:
            ftable.write("%s\t%s\n" % (ID, " ".join(dneighbor[ID])))
def generateTablePropAllDSSTOX(self, prDSSTOXPred, pknownSDF, pLD50, pDSSTOXMapOnCID, insertDB=0):
    """Build ``tablePropForDB.csv``: the properties (``LPROP``) of every mapped chemical.

    Chemicals are read from the substance/compound map file and kept when
    they are present in ``self.dchem``. Properties listed in ``LPROP[3:]``
    start as "NA" and are filled, in this order, from the prediction files,
    the SDF file and the LD50 table (the two last ones are matched on the
    InChIKey). The path of the table is stored in ``self.pTableInAll``.

    Args:
        prDSSTOXPred: folder of the prediction files (only "csv" files).
        pknownSDF: SDF file of known properties.
        pLD50: LD50 table.
        pDSSTOXMapOnCID: comma separated map between substance ids,
            structure ids and preferred names.
        insertDB: when 0, an existing table is reused as it is.
    """
    pTableinfo = self.prout + "tablePropForDB.csv"
    if path.exists(pTableinfo) and insertDB == 0:
        self.pTableInAll = pTableinfo
        return

    lpropfill = LPROP[3:]
    spropfill = set(lpropfill)
    dchem = self.dchem

    # initialisation of the full dictionary
    dDSSTOX = {}
    dmapCIDtoSID = {}

    print("LOAD INFO MAP SID to CID")
    with open(pDSSTOXMapOnCID, "r", encoding="utf8",
              errors="ignore") as filMap:
        llines = filMap.readlines()

    lhead = llines[0].replace("\"", "").strip().split(",")
    iDSSSID = lhead.index("dsstox_substance_id")
    iDSSCID = lhead.index("DSSTox_Structure_Id")
    iname = lhead.index("preferred_name")

    for line in llines[1:]:
        lelem = toolbox.formatLine(line).strip().split(",")
        # The entry is stored only once complete: a chemical absent from
        # dchem (or a short line) must not leave a partial entry behind.
        try:
            DTXSID = lelem[iDSSSID]
            dentry = {
                "preferred_name": lelem[iname],
                "SMILES": dchem[DTXSID]["smiles_clean"],
                "inchikey": dchem[DTXSID]["inchikey"],
            }
            DTXCID = lelem[iDSSCID]
        except (IndexError, KeyError):
            continue
        dDSSTOX[DTXSID] = dentry
        dmapCIDtoSID[DTXCID] = DTXSID

    print("INIT DICTIONNARY")
    # put in dict out -> initialization to NA
    for chem in dDSSTOX:
        for PROP in lpropfill:
            dDSSTOX[chem][PROP] = "NA"

    print("LOAD PRED")
    # load prediction and update table
    for ppred in listdir(prDSSTOXPred):
        if ppred[-3:] != "csv":
            continue
        print(ppred, "Load file")
        dtemp = toolbox.loadMatrixToDict(prDSSTOXPred + ppred, sep=",")
        for chemIDtemp in dtemp:
            drow = dtemp[chemIDtemp]
            DTXCID = drow["MoleculeID"]
            if DTXCID not in dmapCIDtoSID:
                continue
            dentry = dDSSTOX[dmapCIDtoSID[DTXCID]]
            for k in drow.keys():
                if k in spropfill:
                    dentry[k] = drow[k]
    print("PRED LOAD")

    print("LOAD SDF AND LD50")
    #load sdf
    dsdf = parseSDF.parseSDF(pknownSDF, "InChI Key_QSARr", self.prout)
    dsdf.parseAll()

    #load LD50 file
    dLD50 = toolbox.loadMatrixToDict(pLD50)
    print("SDF and table LD50 loaded")

    # index of the chemicals by InChIKey: each source is read once instead
    # of being scanned again for every chemical
    dinchikey = {}
    for chem in dDSSTOX:
        dinchikey.setdefault(dDSSTOX[chem]["inchikey"], []).append(chem)

    # look sdf -> map on the sdf
    for dchemIDsdf in dsdf.lc:
        lchem = dinchikey.get(dchemIDsdf["InChI Key_QSARr"])
        if not lchem:
            continue
        for ksdf in dchemIDsdf.keys():
            if ksdf in spropfill:
                for chem in lchem:
                    dDSSTOX[chem][ksdf] = dchemIDsdf[ksdf]

    # look in LD50 file -> map on the LD50 (applied after the SDF values)
    for chemIDLD50 in dLD50:
        lchem = dinchikey.get(dLD50[chemIDLD50]["InChI Key_QSARr"])
        if not lchem:
            continue
        for kLD50 in dLD50[chemIDLD50].keys():
            if kLD50 in spropfill:
                for chem in lchem:
                    dDSSTOX[chem][kLD50] = dLD50[chemIDLD50][kLD50]

    print("WRITE TABLE")
    with open(pTableinfo, "w") as filout:
        filout.write("ID\t%s\n" % ("\t".join(LPROP)))
        for chem in dDSSTOX:
            filout.write(
                "%s\t%s\n" %
                (chem, "\t".join([str(dDSSTOX[chem][prop])
                                  for prop in LPROP])))

    self.pTableInAll = pTableinfo
def pushCoords(self):
    """Store the map coordinates of the chemicals whose ``dim1d2d`` is null.

    Coordinates are read from ``coord1D2D.csv`` and ``coord3D.csv`` in
    ``self.pr_coords``; the 10 first dimensions of each are written as
    array literals. When both sets are available, ``d3_cube`` (DIM1, DIM2,
    DIM3-1) is written as well. Chemicals without any coordinate are left
    untouched.
    """
    self.cDB.connOpen()
    d_1d2d = toolbox.loadMatrixToDict(self.pr_coords + "coord1D2D.csv",
                                      sep=",")
    d_3d = toolbox.loadMatrixToDict(self.pr_coords + "coord3D.csv", sep=",")
    rows = self.cDB.extractColoumn(
        "chemical_description", "inchikey",
        "WHERE dim1d2d is null AND map_name = '%s';" % (self.map_name))
    l_todo = [row[0] for row in rows]
    self.cDB.connClose()

    shuffle(l_todo)

    for inchikey in l_todo:
        # an empty string marks coordinates that could not be read
        try:
            w_1d2d = "{" + ",".join([
                "\"%s\"" % (str(d_1d2d[inchikey]["DIM%s" % (i)]))
                for i in range(1, 11)
            ]) + "}"
        except:
            w_1d2d = ""
        try:
            w_3d = "{" + ",".join([
                "\"%s\"" % (str(d_3d[inchikey]["DIM3-%s" % (i)]))
                for i in range(1, 11)
            ]) + "}"
        except:
            w_3d = ""

        if w_1d2d == "" and w_3d == "":
            continue

        if w_1d2d != "" and w_3d != "":
            w_cube = "{\"%s\",\"%s\",\"%s\"}" % (
                d_1d2d[inchikey]["DIM1"], d_1d2d[inchikey]["DIM2"],
                d_3d[inchikey]["DIM3-1"])
            cmd_sql = "UPDATE chemical_description SET dim1d2d = '%s', dim3d = '%s', d3_cube = '%s' WHERE inchikey='%s' AND map_name = '%s';" % (
                w_1d2d, w_3d, w_cube, inchikey, self.map_name)
        elif w_1d2d != "":
            cmd_sql = "UPDATE chemical_description SET dim1d2d = '%s' WHERE inchikey='%s' AND map_name = '%s';" % (
                w_1d2d, inchikey, self.map_name)
        else:
            cmd_sql = "UPDATE chemical_description SET dim3d = '%s' WHERE inchikey='%s' AND map_name = '%s';" % (
                w_3d, inchikey, self.map_name)
        self.cDB.updateTable(cmd_sql)
    return
def formatInfo(db, pdesc, lkinfo, pjs, prout):
    """Export the information of each compound as a JS function and as a table.

    For each compound of ``db.lc`` and each key of ``lkinfo``, the value is
    taken from the compound itself, otherwise from the descriptor table
    ``pdesc``; empty or missing values become "NA". A ``loadInfoDrug``
    function is appended to ``pjs`` and ``<prout>tableinfo.csv`` is written.
    Nothing is written when ``pdesc`` does not exist.

    Args:
        db: object with the compounds in ``lc`` and the name key in ``name``.
        pdesc: path of the 1D2D descriptor table.
        lkinfo: keys to export.
        pjs: path of the JS file (appended to when it exists).
        prout: output folder (used as a path prefix).
    """
    # Checked first: returning after the JS header was written would leave
    # an unterminated function in the file.
    if not path.exists(pdesc):
        return

    # load 1D2D desc
    ddesc = toolbox.loadMatrixToDict(pdesc)

    def _info_values(cpd, namecpd):
        # values of lkinfo for one compound, "NA" when empty or missing
        linfo = []
        for kinfo in lkinfo:
            if kinfo in cpd.keys():
                val = cpd[kinfo]
            elif namecpd in ddesc and kinfo in ddesc[namecpd]:
                val = ddesc[namecpd][kinfo]
            else:
                val = ""
            linfo.append("NA" if val == "" else str(val))
        return linfo

    lcpd = []
    for cpd in db.lc:
        namecpd = cpd[db.name]
        lcpd.append((namecpd, _info_values(cpd, namecpd)))

    # write JS; mode "a" creates the file when it does not exist
    with open(pjs, "a") as js:
        js.write("function loadInfoDrug(){\n")
        js.write(" var infodrug={")
        lw = []
        for namecpd, linfo in lcpd:
            lw.append("\"" + str(namecpd) + "\"" + ":[" +
                      ",".join(["\"" + val + "\"" for val in linfo]) + "]")
        js.write(",".join(lw) + "};\n")
        js.write(" return(infodrug);\n};\n\n\n")

    # write table
    pinfo = prout + "tableinfo.csv"
    with open(pinfo, "w") as finfo:
        finfo.write("ID\t" + "\t".join(lkinfo) + "\n")
        for namecpd, linfo in lcpd:
            finfo.write("%s\t%s\n" % (namecpd, "\t".join(linfo)))
def loadlistChem(self):
    """Load the chemicals of the map in ``self.dchem``, preparing them when needed.

    The table ``forDB/db.csv`` is reloaded when it exists. Otherwise each
    chemical of ``self.plistChem`` is prepared (cleaned SMILES and InChIKey,
    "NA" when the preparation fails) and the table is written. Loading
    stops, without setting ``self.dchem``, when a chemical has neither a
    "SMILES" nor an "Original_SMILES" column.
    """
    dir_db = pathFolder.createFolder(self.prout + "forDB/")
    p_table = dir_db + "db.csv"

    if path.exists(p_table):
        self.dchem = toolbox.loadMatrixToDict(p_table, sep="\t")
        return

    # the dsstox list is comma separated, the others tab separated
    separator = "," if self.nameMap == "dsstox" else "\t"
    d_in = toolbox.loadMatrixToDict(self.plistChem, sep=separator)

    dchem = {}
    for key_in in d_in:
        row_in = d_in[key_in]
        if "SMILES" in row_in.keys():
            smiles_origin = row_in["SMILES"]
            dtxsid = row_in["DTXSID"]
        elif "Original_SMILES" in row_in.keys():
            smiles_origin = row_in["Original_SMILES"]
            dtxsid = row_in["dsstox_substance_id"]
        else:
            print("ERROR")
            return

        entry = {}
        dchem[dtxsid] = entry
        entry["db_id"] = dtxsid
        entry["smiles_origin"] = smiles_origin

        # prepare ligand
        c_chem = Chemical.Chemical(smiles_origin, self.prDesc)
        c_chem.prepChem()
        if c_chem.err == 1:
            entry["smiles_clean"] = "NA"
            entry["inchikey"] = "NA"
            entry["qsar_ready"] = 0
        else:
            smiles_clean = c_chem.smi
            inchikey = c_chem.generateInchiKey()
            c_chem.writeSMIClean()
            entry["smiles_clean"] = smiles_clean
            entry["inchikey"] = inchikey
            entry["qsar_ready"] = 1

    # write table for control -> after open and put in the DB
    out = open(p_table, "w", encoding="utf8")
    out.write(
        "db_id\tsmiles_origin\tsmiles_clean\tinchikey\tqsar_ready\t%s\n" %
        (self.nameMap))
    for dtxsid, entry in dchem.items():
        l_val = (dtxsid, entry["smiles_origin"], entry["smiles_clean"],
                 entry["inchikey"], entry["qsar_ready"], 1)
        out.write("%s\t%s\t%s\t%s\t%s\t%s\n" % l_val)
    out.close()

    self.dchem = dchem
def splitMap(self, nbsplit, dim, insertDB=0):
    """Assign every chemical to a slice ("map") along one coordinate axis.

    The axis is swept from the smallest to the largest coordinate in steps of
    0.10; chemicals whose coordinate falls below the current threshold join
    the current slice, and a new slice is started once the current one holds
    more than ``nbchem / nbsplit`` chemicals. The assignment is written to
    ``<prmap>/split_<nbsplit>/map{x,y,z}_split.csv`` and, when requested,
    pushed to the ``dsstox_coords`` table.

    Args:
        nbsplit: target number of slices (sets the per-slice size).
        dim: 1 -> x (first column of coord1D2D.csv), 2 -> y (second column
            of coord1D2D.csv), anything else -> z (first column of
            coord3D.csv).
        insertDB: 1 to update the database from the split file; with 0 an
            already existing split file makes the call a no-op.

    Returns:
        None. Requires ``self.prmap`` to be set (prints a message and
        returns otherwise).
    """
    if "prmap" not in self.__dict__:
        print("Generate the map files first")
        return

    prout = pathFolder.createFolder(self.prmap + "split_" + str(nbsplit) + "/")
    self.prmaps = prout

    if "psplitMap" not in self.__dict__:
        self.psplitMap = {}

    # One place decides file name, DB column, coordinate file and column
    # index. Previously a dim outside {1, 2, 3} picked the z file but then
    # failed with an unbound local during calibration; it is now treated as
    # z throughout.
    if dim == 1:
        pfilout = prout + "mapx_split.csv"
        mapIn = "mapx"
        pcoord = self.prmap + "coord1D2D.csv"
        icol = 0
    elif dim == 2:
        pfilout = prout + "mapy_split.csv"
        mapIn = "mapy"
        pcoord = self.prmap + "coord1D2D.csv"
        icol = 1
    else:
        pfilout = prout + "mapz_split.csv"
        mapIn = "mapz"
        pcoord = self.prmap + "coord3D.csv"
        icol = 0

    self.psplitMap[dim] = pfilout

    if path.exists(pfilout) and insertDB == 0:
        return
    elif not path.exists(pfilout):
        din = toolbox.loadMatrixCoords(pcoord, 2)

        # both bounds start at 0.0, so the sweep always covers the origin
        maxDim = 0.0
        minDim = 0.0
        nbchembymap = int(len(din) / nbsplit)

        # calibrate max and min
        print("== Initiate calibration ==")
        for coords in din.values():
            dimVal = coords[icol]
            if dimVal > maxDim:
                maxDim = dimVal
            if dimVal < minDim:
                minDim = dimVal
        print("== End calibration ==")

        dmap = {}
        imap = 1
        dmap[imap] = []

        # Chemicals not assigned yet, in the order of the coordinate table.
        # Each step partitions this list instead of deleting from its middle
        # (same result and order, without the quadratic list shifting).
        lremaining = list(din.keys())
        dimVal = minDim
        while dimVal < maxDim:
            dimVal = dimVal + 0.10
            if len(dmap[imap]) > nbchembymap:
                imap = imap + 1
                dmap[imap] = []

            lkept = []
            for chem in lremaining:
                if din[chem][icol] < dimVal:
                    dmap[imap].append(chem)
                else:
                    lkept.append(chem)
            lremaining = lkept

        print("==== Write output ====")
        with open(pfilout, "w") as filout:
            filout.write("inchikey\tmap\n")
            for d in dmap.keys():
                for chem in dmap[d]:
                    filout.write("%s\t%s\n" % (chem, d))

    if insertDB == 1:
        cDB = DBrequest.DBrequest()
        #cDB.verbose = 1
        dmap = toolbox.loadMatrixToDict(pfilout)
        tableIn = "dsstox_coords"
        # NOTE(review): the statement is assembled by string formatting; the
        # values come from the file generated above, not from user input.
        for chem in dmap.keys():
            inch = chem.replace("\"", "")
            cmdSQL = "UPDATE %s SET %s=%s WHERE inchikey='%s';" % (
                tableIn, mapIn, dmap[chem]["map"], inch)
            cDB.updateTable(cmdSQL)
def computeCoords(self, corVal, distributionVal, insertDB=1):
    """Compute the map coordinate files and optionally load them in the DB.

    The coordinate files ``coord1D2D.csv`` and ``coord3D.csv`` are expected
    in ``<prout>/map_<corVal>-<distributionVal>/``; when one is missing
    ``runExternalSoft.RComputeMapFiles`` is run to produce them. On success
    their paths are stored in ``self.pcoords1D2D`` / ``self.pcoords3D`` and
    the folder in ``self.prmap``.

    Args:
        corVal: correlation threshold forwarded to the map computation.
        distributionVal: distribution threshold forwarded to the map
            computation.
        insertDB: 1 to insert one row per chemical into the
            ``<nameMap>_coords`` table.

    Returns:
        None. Prints "ERROR file map" and returns when the coordinate files
        could not be produced.
    """
    # NOTE(review): descriptors are only recomputed when BOTH attributes are
    # missing; if just one is absent the call below would fail on it --
    # confirm whether "or" was intended.
    if not "p1D2D" in self.__dict__ and not "p3D" in self.__dict__:
        self.computeDesc(insertDB=0, w=1)

    # create coords
    prmap = pathFolder.createFolder(self.prout + "map_" + str(corVal) + "-" +
                                    str(distributionVal) + "/")
    self.prmap = prmap

    pcoordDim1Dim2 = prmap + "coord1D2D.csv"
    pcoordDim3D = prmap + "coord3D.csv"
    if not path.exists(pcoordDim1Dim2) or not path.exists(pcoordDim3D):
        runExternalSoft.RComputeMapFiles(self.p1D2D, self.p3D, prmap, corVal,
                                         distributionVal)

    if not path.exists(pcoordDim1Dim2) or not path.exists(pcoordDim3D):
        print("ERROR file map")
        return

    self.pcoords1D2D = pcoordDim1Dim2
    self.pcoords3D = pcoordDim3D

    if insertDB != 1:
        return

    def _toArray(lcoord):
        """Format coordinates as a brace-delimited list of quoted values."""
        return "{" + ",".join(["\"%s\"" % (str(coord)) for coord in lcoord]) + "}"

    dsstox = self.nameMap == "dsstox"
    if dsstox:
        dcoord1D2D = toolbox.loadMatrixCoords(pcoordDim1Dim2, 10)
        dcoord3D = toolbox.loadMatrixCoords(pcoordDim3D, 10)
    else:
        dcoord1D2D = toolbox.loadMatrixToDict(pcoordDim1Dim2, ",")
        dcoord3D = toolbox.loadMatrixToDict(pcoordDim3D, ",")

    cDB = DBrequest.DBrequest()
    cDB.verbose = 0

    tableCoords = "%s_coords" % (self.nameMap)
    for chem in list(dcoord1D2D.keys()):
        # pop so the (potentially very large) tables shrink while inserting
        coords1D2D = dcoord1D2D.pop(chem)
        coords3D = dcoord3D.pop(chem)

        if dsstox:
            w1D2D = _toArray(coords1D2D)
            w3D = _toArray(coords3D)
        else:
            # Rows are dicts keyed "DIM<k>" / "DIM3-<k>" plus one non
            # coordinate column, hence len - 1 dimensions. The previous code
            # reused the chemical index as the comprehension variable, so the
            # values were read from other chemicals' rows.
            w1D2D = _toArray([
                coords1D2D["DIM" + str(idim)]
                for idim in range(1, len(coords1D2D))
            ])
            w3D = _toArray([
                coords3D["DIM3-" + str(idim)]
                for idim in range(1, len(coords3D))
            ])

        cDB.addElement(tableCoords,
                       ["inchikey", "dim1d2d", "dim3d", "in_db"],
                       [chem, w1D2D, w3D, "1"])
def UpdateDBChemPropVal(prPred, pInde, psdf, LPROP, prout):
    """Write the table of property values used to update the chemical DB.

    For every DTXSID found in the prediction file, each property of
    ``LPROP`` is looked up, in order, in the prediction row, in the
    identifier table (``pInde``) and in the SDF record sharing the same
    original SMILES. The first hit wins; a property found nowhere is written
    as "NA" (and its name printed), and "NaN" values are normalised to "NA".

    Args:
        prPred: folder containing the prediction CSV files.
        pInde: comma-separated identifier table (with "DTXSID" and "SMILES"
            columns).
        psdf: SDF file parsed with ``parseSDF.parseSDF``.
        LPROP: ordered list of property names to export.
        prout: output folder prefix; ``OPERA_desc_update.csv`` is written
            there.

    Returns:
        None.
    """
    # load SDF -- first record wins for a given original SMILES
    cSDF = parseSDF.parseSDF(psdf, "CASRN", prout)
    cSDF.parseAll()

    dSDF = {}
    for chem in cSDF.lc:
        if chem["Original_SMILES"] not in dSDF:
            dSDF[chem["Original_SMILES"]] = deepcopy(chem)

    # pIdentifier, re-keyed by DTXSID (rows without that column are skipped)
    dSMILES = toolbox.loadMatrixToDict(pInde, sep=",")
    dSMILES_out = {}
    for row in dSMILES.values():
        try:
            DTXSID = row["DTXSID"]
        except KeyError:
            continue
        dSMILES_out[DTXSID] = deepcopy(row)

    # load chem prediction
    # NOTE(review): only the first entry returned by listdir is read (the
    # [:1] slice is kept from the original) -- confirm this is intended.
    lfilePred = listdir(prPred)
    dpred = {}
    for filePred in lfilePred[:1]:
        print(filePred)
        dtemp = toolbox.loadMatrixToDict(prPred + filePred, sep=",")
        # primary key will be DTXSID; the first row seen for an ID is kept
        for rowpred in dtemp.values():
            dpred.setdefault(rowpred["dsstox_substance_id"], rowpred)

    # write for DB Update
    pfiloutDesc = prout + "OPERA_desc_" + "update.csv"
    with open(pfiloutDesc, "w") as filoutDesc:
        filoutDesc.write("DTXSID\t%s\n" % "\t".join(LPROP))

        for DTXSID in dpred.keys():
            try:
                SMILES = dSMILES_out[DTXSID]["SMILES"]
            except KeyError:
                # placeholder key, simply misses in the SDF lookup below
                SMILES = "ERROR"

            # sources are tried in priority order
            lsource = ((dpred, DTXSID), (dSMILES_out, DTXSID), (dSDF, SMILES))
            lval = []
            for PROP in LPROP:
                for dsource, key in lsource:
                    try:
                        val = str(dsource[key][PROP])
                    except KeyError:
                        continue
                    break
                else:
                    val = "NA"
                    print(PROP)

                if val == "NaN":
                    val = "NA"
                lval.append(val)

            filoutDesc.write("%s\t%s\n" % (DTXSID, "\t".join(lval)))