def test2XValClass(self):
    # Runs CrossValidationDriver on random_pts.csv with 11 neighbours, pins the
    # returned error, then exercises classification and the naming accessors
    # of the returned model.
    csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data', 'random_pts.csv')
    dataSet = DataUtils.TextFileToData(csvPath)
    examples = dataSet.GetNamedData()
    npvals = dataSet.GetNPossibleVals()
    nVars = dataSet.GetNVars()
    attrs = list(range(1, nVars + 1))
    neighbourCount = 11

    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, neighbourCount,
                                                   silent=1)
    self.assertAlmostEqual(err, 0.01075, 4)

    # classify the first example and collect its neighbours
    neighbours = []
    predicted = mod.ClassifyExample(examples[0], neighborList=neighbours)
    self.assertEqual(predicted, 1)
    self.assertEqual(neighbours[0][1], examples[0])

    # name handling: empty by default, settable, and NameModel() resets it
    self.assertEqual(mod.GetName(), '')
    mod.SetName('name')
    self.assertEqual(mod.GetName(), 'name')
    self.assertEqual(mod.type(), 'Classification Model')
    mod.NameModel('this argument is ignored')
    self.assertEqual(mod.GetName(), 'Classification Model')
def test2NaiveBayes(self):
    # Runs CrossValidationDriver on stddata.csv with mEstimateVal=20.0, pins
    # the returned error and checks the model's naming and example accessors.
    csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes', 'test_data', 'stddata.csv')
    dataSet = DataUtils.TextFileToData(csvPath)
    examples = dataSet.GetNamedData()
    nVars = dataSet.GetNVars()
    attrs = list(range(1, nVars + 1))
    npvals = [0] + [3] * nVars + [2]
    qBounds = [0] + [2] * nVars + [0]

    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, qBounds,
                                                   mEstimateVal=20.0, silent=True)
    self.assertIsInstance(mod, NaiveBayesClassifier)
    self.assertAlmostEqual(err, 0.1818, 4)

    # name handling
    self.assertEqual(mod.GetName(), '')
    mod.SetName('modelName')
    self.assertEqual(mod.GetName(), 'modelName')
    mod.NameModel(None)
    self.assertEqual(mod.GetName(), 'NaiveBayesClassifier')

    # both example lists are populated and together cover the input examples
    self.assertGreater(len(mod.GetExamples()), 0)
    self.assertGreater(len(mod.GetTrainingExamples()), 0)
    combined = mod.GetTrainingExamples() + mod.GetExamples()
    self.assertEqual(sorted(combined), sorted(examples))
def test1NaiveBayes(self):
    # Runs CrossValidationDriver on stddata.csv with default options and pins
    # the model's _classProbs, _QBoundVals and the returned error.
    fName = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes', 'test_data', 'stddata.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()

    nvars = data.GetNVars()
    # materialise the attribute indices: on Python 3 range() is lazy, not a list
    attrs = list(range(1, nvars + 1))
    npvals = [0] + [3] * nvars + [2]
    qBounds = [0] + [2] * nvars + [0]
    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, qBounds, silent=True)

    self.assertAlmostEqual(mod._classProbs[0], 0.5000, 4)
    self.assertAlmostEqual(mod._classProbs[1], 0.5000, 4)
    self.assertAlmostEqual(mod._QBoundVals[1][0], -0.0360, 4)
    # this check deliberately keeps assertAlmostEqual's default precision
    self.assertAlmostEqual(mod._QBoundVals[1][1], 0.114)
    self.assertAlmostEqual(mod._QBoundVals[2][0], -0.7022, 4)
    self.assertAlmostEqual(mod._QBoundVals[2][1], -0.16635, 4)
    self.assertAlmostEqual(mod._QBoundVals[3][0], -0.3659, 4)
    self.assertAlmostEqual(mod._QBoundVals[3][1], 0.4305, 4)
    self.assertAlmostEqual(err, 0.2121, 4)
def test1NaiveBayes(self):
    # Runs CrossValidationDriver three times on stddata.csv (default options,
    # calcTotalError=True, replacementSelection=True) and pins _classProbs,
    # _QBoundVals and the returned error after each run.
    csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes', 'test_data', 'stddata.csv')
    dataSet = DataUtils.TextFileToData(csvPath)
    examples = dataSet.GetNamedData()
    nVars = dataSet.GetNVars()
    attrs = list(range(1, nVars + 1))
    npvals = [0] + [3] * nVars + [2]
    qBounds = [0] + [2] * nVars + [0]

    # extra positional args for assertAlmostEqual: 4 places, or its default
    fourPlaces = (4, )
    defaultPlaces = ()

    def verify(model, error, classProbs, boundChecks, expectedError):
        # class probabilities first, then the bound values, then the error
        for classIdx, prob in enumerate(classProbs):
            self.assertAlmostEqual(model._classProbs[classIdx], prob, 4)
        for attrIdx, boundIdx, value, places in boundChecks:
            self.assertAlmostEqual(model._QBoundVals[attrIdx][boundIdx], value, *places)
        self.assertAlmostEqual(error, expectedError, 4)

    # run 1: default options
    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, qBounds, silent=True)
    verify(mod, err, (0.5000, 0.5000), [
        (1, 0, -0.0360, fourPlaces),
        (1, 1, 0.114, defaultPlaces),
        (2, 0, -0.7022, fourPlaces),
        (2, 1, -0.16635, fourPlaces),
        (3, 0, -0.3659, fourPlaces),
        (3, 1, 0.4305, fourPlaces),
    ], 0.2121)

    # run 2: calcTotalError=True
    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, qBounds, silent=True,
                                                   calcTotalError=True)
    verify(mod, err, (0.515151, 0.484848), [
        (1, 0, -0.40315, fourPlaces),
        (1, 1, 0.114, defaultPlaces),
        (2, 0, -0.62185, fourPlaces),
        (2, 1, -0.19965, fourPlaces),
        (3, 0, 0.4305, fourPlaces),
        (3, 1, 0.80305, fourPlaces),
    ], 0.14563)

    # run 3: replacementSelection=True
    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, qBounds, silent=True,
                                                   replacementSelection=True)
    verify(mod, err, (0.5131578, 0.4868421), [
        (1, 0, -0.036, fourPlaces),
        (1, 1, 0.93465, fourPlaces),
        (2, 0, -0.6696, fourPlaces),
        (2, 1, -0.19965, fourPlaces),
        (3, 0, -1.06785, fourPlaces),
        (3, 1, 0.4305, fourPlaces),
    ], 0.3)
def RunIt(details, progressCallback=None, saveIt=1, setDescNames=0):
    """ loads the data set described by _details_ and builds a composite model from it

    The run date is stamped on _details_, a default output name
    (table name + '.pkl') is filled in when _details.outName_ is empty,
    the data set is loaded, and the actual construction is handed off to
    _RunOnData()_.

    **Arguments**

      - details:  a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - progressCallback: (optional) a function which is called with a single
        argument (the number of models built so far) after each model is built.

      - saveIt: (optional) if this is nonzero, the resulting model will be pickled
        and dumped to the filename specified in _details.outName_

      - setDescNames: (optional) if nonzero, the composite's _SetInputOrder()_ method
        will be called using the results of the data set's _GetVarNames()_ method;
        it is assumed that the details object has a _descNames attribute which
        is passed to the composites _SetDescriptorNames()_ method.  Otherwise
        (the default), _SetDescriptorNames()_ gets the results of _GetVarNames()_.

    **Returns**

      the composite model constructed

    """
    details.rundate = time.asctime()

    tableName = details.tableName.strip()
    if details.outName == '':
        details.outName = tableName + '.pkl'

    if not details.dbName:
        # no database given: the name goes straight to the DataUtils loaders
        if details.qBounds != []:
            loader = DataUtils.TextFileToData
        else:
            loader = DataUtils.BuildQuantDataSet
        data = loader(tableName)
    elif details.useSigTrees or details.useSigBayes:
        details.tableName = tableName
        data = details.GetDataSet(pickleCol=0, pickleClass=DataStructs.ExplicitBitVect)
    elif details.qBounds != [] or not details.useTrees:
        details.tableName = tableName
        data = details.GetDataSet()
    else:
        data = DataUtils.DBToQuantData(  # Function no longer defined
            details.dbName, tableName, quantName=details.qTableName, user=details.dbUser,
            password=details.dbPassword)

    return RunOnData(details, data, progressCallback=progressCallback, saveIt=saveIt,
                     setDescNames=setDescNames)
def test2XValClass(self):
    # Runs CrossValidationDriver on random_pts.csv with 11 neighbours and pins
    # the returned error.
    fName = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data', 'random_pts.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()
    npvals = data.GetNPossibleVals()
    nvars = data.GetNVars()
    # materialise the attribute indices: on Python 3 range() is lazy, not a list
    attrs = list(range(1, nvars + 1))
    numNeigh = 11

    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, numNeigh, silent=1)
    self.assertAlmostEqual(err, 0.01075, 4)
def test4XValRegress(self):
    # Runs CrossValidationDriver on random_pts.csv with 11 neighbours, using
    # CrossValidate.makeRegressionModel as the model builder, and pins the
    # returned error.
    fName = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data', 'random_pts.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()
    npvals = data.GetNPossibleVals()
    nvars = data.GetNVars()
    # materialise the attribute indices: on Python 3 range() is lazy, not a list
    attrs = list(range(1, nvars + 1))
    numNeigh = 11

    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, numNeigh, silent=1,
                                                   modelBuilder=CrossValidate.makeRegressionModel)
    # NOTE: this number hasn't been extensively checked
    self.assertAlmostEqual(err, 0.0777, 4)
def test2NaiveBayes(self):
    # Runs CrossValidationDriver on stddata.csv with mEstimateVal=20.0 and pins
    # the returned error.
    fName = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes', 'test_data', 'stddata.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()

    nvars = data.GetNVars()
    # materialise the attribute indices: on Python 3 range() is lazy, not a list
    attrs = list(range(1, nvars + 1))
    npvals = [0] + [3] * nvars + [2]
    qBounds = [0] + [2] * nvars + [0]
    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, qBounds,
                                                   mEstimateVal=20.0)

    # go through the TestCase API: a bare assert is stripped under `python -O`
    self.assertTrue(feq(err, 0.19354))
def test1Neighbors(self):
    # Checks KNNModel.GetNeighbors() against a brute-force reference: the
    # distance from the query point to every training example is computed with
    # the same metric, sorted, and the first numNeigh entries are compared.
    fName = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data', 'random_pts.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()
    nvars = data.GetNVars()
    # materialise the attribute indices: on Python 3 range() is lazy, not a list
    attrs = list(range(1, nvars + 1))
    numNeigh = 11
    metric = DistFunctions.EuclideanDist
    mdl = KNNModel.KNNModel(numNeigh, attrs, metric)

    # the first example is the query point; the remainder is the training set
    pt = examples.pop(0)
    tgt = [(metric(pt, ex, attrs), ex) for ex in examples]
    tgt.sort()
    mdl.SetTrainingExamples(examples)
    neighbors = mdl.GetNeighbors(pt)

    for i in range(numNeigh):
        # the reference distance is negated before comparison with the
        # value stored in the neighbor list
        # go through the TestCase API: bare asserts are stripped under `python -O`
        self.assertTrue(feq(-tgt[i][0], neighbors[i][0]))
        self.assertEqual(tgt[i][1][0], neighbors[i][1][0])
def test3Regress(self):
    # a carefully laid out regression data set where the results are clear:
    fName = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data', 'sample_pts.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()
    nvars = data.GetNVars()
    attrs = list(range(1, nvars + 1))
    numNeigh = 4
    metric = DistFunctions.EuclideanDist
    mdl = KNNRegressionModel.KNNRegressionModel(numNeigh, attrs, metric)
    mdl.SetTrainingExamples(examples)

    # go through the TestCase API (as the rest of this test does): bare
    # asserts are stripped under `python -O`
    res = mdl.PredictExample([4, -3.5, 2.5, 0])
    self.assertTrue(feq(res, 1.25))
    res = mdl.PredictExample([4, 3, 2, 0])
    self.assertTrue(feq(res, 1.5))
    res = mdl.PredictExample([4, 3, -2.5, 0])
    self.assertTrue(feq(res, -1.5))

    # Use a distance dependent weight for the neighbours
    res = mdl.PredictExample([4, 3, -2.5, 0], weightedAverage=True)
    self.assertAlmostEqual(res, -1.6)

    # Check the case that the example is identical to one of the neighbours (distance = 0)
    neighborList = []
    res = mdl.PredictExample(examples[0], weightedAverage=True, neighborList=neighborList)
    self.assertAlmostEqual(res, 1.5857864)
    self.assertEqual(neighborList[0][1], examples[0])

    self.assertEqual(mdl.GetBadExamples(), [])

    # name handling: empty by default, settable, and NameModel() resets it
    self.assertEqual(mdl.GetName(), '')
    mdl.SetName('name')
    self.assertEqual(mdl.GetName(), 'name')
    self.assertEqual(mdl.type(), 'Regression Model')
    mdl.NameModel('this argument is ignored')
    self.assertEqual(mdl.GetName(), 'Regression Model')

    self.assertEqual(
        sorted(mdl.GetTrainingExamples() + mdl.GetTestExamples()), sorted(examples))
def test3Regress(self):
    """ a carefully laid out regression data set where the results are clear: """
    fName = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data', 'sample_pts.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()
    nvars = data.GetNVars()
    # materialise the attribute indices: on Python 3 range() is lazy, not a list
    attrs = list(range(1, nvars + 1))
    numNeigh = 4
    metric = DistFunctions.EuclideanDist
    mdl = KNNRegressionModel.KNNRegressionModel(numNeigh, attrs, metric)
    mdl.SetTrainingExamples(examples)

    # go through the TestCase API: bare asserts are stripped under `python -O`
    res = mdl.PredictExample([4, -3.5, 2.5, 0])
    self.assertTrue(feq(res, 1.25))
    res = mdl.PredictExample([4, 3, 2, 0])
    self.assertTrue(feq(res, 1.5))
    res = mdl.PredictExample([4, 3, -2.5, 0])
    self.assertTrue(feq(res, -1.5))
def test1NaiveBayes(self):
    # Runs CrossValidationDriver on stddata.csv with default options and pins
    # the model's _classProbs, _QBoundVals and the returned error.
    fName = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes', 'test_data', 'stddata.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()

    nvars = data.GetNVars()
    # materialise the attribute indices: on Python 3 range() is lazy, not a list
    attrs = list(range(1, nvars + 1))
    npvals = [0] + [3] * nvars + [2]
    qBounds = [0] + [2] * nvars + [0]
    mod, err = CrossValidate.CrossValidationDriver(examples, attrs, npvals, qBounds)

    # go through the TestCase API: bare asserts are stripped under `python -O`
    self.assertTrue(feq(mod._classProbs[0], 0.54167))
    self.assertTrue(feq(mod._classProbs[1], 0.45833))
    self.assertTrue(feq(mod._QBoundVals[1][0], -0.56995))
    self.assertTrue(feq(mod._QBoundVals[1][1], 0.114))
    self.assertTrue(feq(mod._QBoundVals[2][0], -0.7022))
    self.assertTrue(feq(mod._QBoundVals[2][1], -0.2347))
    self.assertTrue(feq(mod._QBoundVals[3][0], -0.3659))
    self.assertTrue(feq(mod._QBoundVals[3][1], 1.17275))
    self.assertTrue(feq(err, 0.16129))
def FingerprintsFromDetails(details, reportFreq=10):
    """ reads the input described by _details_, fingerprints it and
    optionally stores the results

    The input source is picked from _details_ in this order:

      1) a database table (_dbName_ and _tableName_ set)
      2) a text file with SMILES (_inFileName_ and _useSmiles_ set)
      3) an SD file (_inFileName_ and _useSD_ set)

    If none applies, or the input could not be read, nothing is
    fingerprinted and None is returned.

    **Arguments**

      - details: the options object; the attributes read here are
        dbName, tableName, idName, smilesName, inFileName, useSmiles,
        useSD, maxMols, molPklName, outFileName, outDbName,
        outTableName, fpColName and replaceTable. _idName_ is filled in
        on the object when it is empty. The full attribute dictionary
        is also forwarded as keyword arguments to the fingerprinting
        helper that is called.

      - reportFreq: (optional) when positive, a progress message is
        emitted every _reportFreq_ molecules while reading an SD file.

    **Returns**

      whatever the fingerprinting helper returned (iterated below as
      (ID, fingerprint) pairs), or None if nothing was fingerprinted

    **Notes**

      - read problems are reported via _error()_ plus a traceback and are
        otherwise not fatal here

    """
    data = None
    # bound up front so that a failed read falls through to "nothing to do"
    # below instead of leaving the name undefined
    dataSet = None
    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            # the connection is only opened as a check; it is not used below
            conn = DbConnect(details.dbName, details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n' %
                  (details.dbName, details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName, details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName, details.smilesName))
        idCol = 0
        smiCol = 1
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(
                details.inFileName, onlyCols=[details.idName, details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        idCol = 0
        smiCol = 1
    elif details.inFileName and details.useSD:
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            s = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            while 1:
                try:
                    # builtin next() works with both the Python 2 and 3 iterator protocols
                    m = next(s)
                except StopIteration:
                    break
                if m:
                    dataSet.append(m)
                    if reportFreq > 0 and not len(dataSet) % reportFreq:
                        message('Read %d molecules\n' % (len(dataSet)))
                    # checked for every molecule read so that the cap holds
                    # regardless of reportFreq
                    if details.maxMols > 0 and len(dataSet) >= details.maxMols:
                        break

            # pair each molecule with its name: the idName property if present,
            # the '_Name' property otherwise
            for i, mol in enumerate(dataSet):
                if mol.HasProp(details.idName):
                    nm = mol.GetProp(details.idName)
                else:
                    nm = mol.GetProp('_Name')
                dataSet[i] = (nm, mol)

    fps = None
    if dataSet and not details.useSD:
        data = dataSet.GetNamedData()
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data, idCol, smiCol, **details.__dict__)
        else:
            fps = FingerprintsFromPickles(data, idCol, smiCol, **details.__dict__)
    elif dataSet and details.useSD:
        fps = FingerprintsFromMols(dataSet, **details.__dict__)

    if fps:
        if details.outFileName:
            # one pickle per entry, written back to back in the same file
            with open(details.outFileName, 'wb+') as outF:
                for entry in fps:
                    pickle.dump(entry, outF)
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbUtils, DbModule
            conn = DbConnect(dbName)
            #
            #  We don't have a db open already, so we'll need to figure out
            #    the types of our columns...
            #
            # NOTE(review): `data` is only populated on the non-SD paths above;
            # with SD input it is still None here -- confirm whether DB output
            # is meant to be supported for SD files.
            colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
            typeStrs = DbUtils.GetTypeStrings([details.idName, details.smilesName], colTypes,
                                              keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName, DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            if details.replaceTable or \
               details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
                conn.AddTable(details.outTableName, cols)

            #
            # And add the data
            #
            for ID, fp in fps:
                tpl = ID, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps