def __init__(self, params):
    '''
    Initializes the fold from its parameter object.

    :param params: the XValFoldParams object of this fold; its foldIdx
                   is used to build the fold identifier.
    '''
    self.params = params
    # identifier of the form 'Fold-<index>'
    label = 'Fold-%d' % params.foldIdx
    self.fold_id = label
    # every fold starts out with an empty confusion matrix of its own
    self.confMatrix = ConfusionMatrix()
def __init__(self, params):
    '''
    Initializes the fold from its parameter object and dumps the fold's
    training and test databases into the result directory.

    :param params: the XValFoldParams object of this fold. Its fold_idx,
                   directory, learn_dbs and test_dbs attributes are read.
    '''
    self.params = params
    self.fold_id = 'Fold-%d' % params.fold_idx
    self.confmat = ConfusionMatrix()

    # (file name pattern, attribute of params holding the databases);
    # the training databases are written first, then the test databases
    dumps = (
        ('train_dbs_%d.db', 'learn_dbs'),
        ('test_dbs_%d.db', 'test_dbs'),
    )
    for pattern, attr in dumps:
        target = os.path.join(params.directory, pattern % params.fold_idx)
        with open(target, 'w+') as dbfile:
            Database.write_dbs(getattr(params, attr), dbfile)
def doXVal(folds, percent, verbose, multicore, noisy, predName, domain,
           mlnfile, dbfiles, logicLearn, logicInfer, inverse=False,
           testSetCount=1):
    '''
    Runs a k-fold cross-validation over the databases read from dbfiles.

    A timestamped result directory with the subdirectories 'FOL' and
    'FUZZY' is created relative to the current working directory, a log
    file 'xval.log' is attached to the 'xval' logger, the databases are
    shuffled and split into `folds` partitions, one XValFold is built per
    fold and all folds are executed through runFold. Afterwards the
    per-fold results are merged by prepareResults.

    The process is terminated with exit status 1 if there are fewer
    databases than folds or if the multi-process run fails.

    :param folds:        number of folds (k).
    :param percent:      percentage of the databases to use; if this
                         selects fewer than all of them, a random sample
                         of that size is drawn.
    :param verbose:      forwarded to readMLNFromFile.
    :param multicore:    if true, the folds run in a multiprocessing
                         Pool, otherwise one after another in this process.
    :param noisy:        stored as noisyStringDomains in the fold params.
    :param predName:     stored as queryPred in the fold params.
    :param domain:       stored as queryDom in the fold params.
    :param mlnfile:      path of the MLN file to read.
    :param dbfiles:      iterable of database file paths.
    :param logicLearn:   not used by this function.
    :param logicInfer:   stored as logicInfer in the fold params.
    :param inverse:      not used by this function.
    :param testSetCount: number of partitions forming the test set of
                         each fold.
    '''
    startTime = time.time()
    directory = time.strftime(
        "%a_%d_%b_%Y_%H:%M:%S_K=" + str(folds) + "_TSC=" + str(testSetCount),
        time.localtime())
    os.mkdir(directory)
    os.mkdir(os.path.join(directory, 'FOL'))
    os.mkdir(os.path.join(directory, 'FUZZY'))

    # set up the logger
    log = logging.getLogger('xval')
    fileLogger = FileHandler(os.path.join(directory, 'xval.log'))
    fileLogger.setFormatter(praclog.formatter)
    log.addHandler(fileLogger)
    log.info('Results will be written into %s' % directory)

    # preparations: read the MLN and the databases
    mln_ = readMLNFromFile(mlnfile, verbose=verbose, logic='FuzzyLogic',
                           grammar='PRACGrammar')
    log.info('Read MLN %s.' % mlnfile)
    dbs = []
    for dbfile in dbfiles:
        db = readDBFromFile(mln_, dbfile)
        # a single file may yield one database or a list of them
        if isinstance(db, list):
            dbs.extend(db)
        else:
            dbs.append(db)
    log.info('Read %d databases.' % len(dbs))

    # optionally restrict the data to a random subset
    subsetLen = int(math.ceil(len(dbs) * percent / 100.0))
    if subsetLen < len(dbs):
        log.info('Using only %d of %d DBs' % (subsetLen, len(dbs)))
        dbs = sample(dbs, subsetLen)
    if len(dbs) < folds:
        log.error(
            'Cannot do %d-fold cross validation with only %d databases.'
            % (folds, len(dbs)))
        # this is a failure, so report it with a non-zero status like the
        # other error paths of this function
        sys.exit(1)

    # create the partition of data: after shuffling, deal the databases
    # out round-robin. Fixed-size slices of ceil(n / folds) elements can
    # leave trailing partitions empty (e.g. 9 databases in 6 folds give
    # sizes 2,2,2,2,1,0); striding keeps all sizes within one of each
    # other and non-empty, because len(dbs) >= folds holds here.
    shuffle(dbs)
    partition = [dbs[i::folds] for i in range(folds)]

    foldRunnables = []
    for foldIdx in range(folds):
        remaining = list(partition)
        params = XValFoldParams()
        params.mln = mln_.duplicate()
        params.testDBs = []
        params.learnDBs = []
        for _ in range(testSetCount):
            # take the partition at foldIdx; once the shrinking list is
            # too short for that index, wrap around to its front
            pick = foldIdx if foldIdx < len(remaining) else 0
            params.testDBs.extend(remaining.pop(pick))
        # everything that is not in the test set is used for learning
        for part in remaining:
            params.learnDBs.extend(part)
        print('LEARN DBS :' + str(len(params.learnDBs)))
        print('TEST DBS :' + str(len(params.testDBs)))
        params.foldIdx = foldIdx
        params.foldCount = folds
        params.noisyStringDomains = noisy
        params.directory = directory
        params.queryPred = predName
        params.queryDom = domain
        params.logicInfer = logicInfer
        foldRunnables.append(XValFold(params))

    if multicore:
        # bound before the try so that the interrupt handler can tell
        # whether a pool was actually created
        workerPool = None
        try:
            # set up a pool of worker processes
            workerPool = Pool()
            log.info('Starting %d-fold Cross-Validation in %d processes.'
                     % (folds, workerPool._processes))
            result = workerPool.map_async(runFold, foldRunnables).get()
            workerPool.close()
            workerPool.join()
            # NOTE(review): this in-memory aggregate is not read again in
            # this function; the persisted results are assembled from the
            # per-fold files by prepareResults below.
            cm = ConfusionMatrix()
            for r in result:
                cm.combine(r.confMatrix)
            elapsedTimeMP = time.time() - startTime
            prepareResults(directory, 'FOL')
            prepareResults(directory, 'FUZZY')
        except (KeyboardInterrupt, SystemExit, SystemError):
            log.critical("Caught KeyboardInterrupt, terminating workers")
            if workerPool is not None:
                workerPool.terminate()
                workerPool.join()
            sys.exit(1)
        except Exception:
            log.error('\n' + ''.join(
                traceback.format_exception(*sys.exc_info())))
            sys.exit(1)
    else:
        log.info('Starting %d-fold Cross-Validation in 1 process.' % folds)
        for fold in foldRunnables:
            runFold(fold)
        prepareResults(directory, 'FOL')
        prepareResults(directory, 'FUZZY')
        elapsedTimeSP = time.time() - startTime

    if multicore:
        log.info('%d-fold crossvalidation (MP) took %.2f min'
                 % (folds, elapsedTimeMP / 60.0))
    else:
        log.info('%d-fold crossvalidation (SP) took %.2f min'
                 % (folds, elapsedTimeSP / 60.0))
def prepareResults(directory, logic):
    '''
    Merges the confusion matrices stored in the `logic` subdirectory of
    `directory` into a single matrix and writes it to 'conf_matrix.cm'
    in that same subdirectory.

    Every other entry of the subdirectory is unpickled and handed to
    ConfusionMatrix.combine.

    :param directory: the result directory of the cross-validation run.
    :param logic:     name of the subdirectory holding the per-fold
                      results (doXVal passes 'FOL' and 'FUZZY').
    '''
    resultDir = os.path.join(directory, logic)
    outName = 'conf_matrix.cm'
    cm = ConfusionMatrix()
    for f in os.listdir(resultDir):
        # an aggregate left by an earlier call must not be merged on top
        # of the fold results it was built from
        if f == outName:
            continue
        # NOTE: unpickling can execute arbitrary code; only use this on
        # result directories produced by a trusted run.
        with open(os.path.join(resultDir, f), 'rb') as infile:
            matrix = pickle.load(infile)
        cm.combine(matrix)
    cm.toFile(os.path.join(resultDir, outName))
def run(self):
    '''
    Executes this fold of the cross-validation.

    The MLN weights are learned on the fold's training databases, the
    learned MLN is written to 'run_N.mln' (N being the fold index) in the
    result directory and is then evaluated on the test databases, once
    with 'FirstOrderLogic' and once with 'FuzzyLogic'. Each evaluation
    fills a fresh ConfusionMatrix, which is written to 'conf_matrix_N.cm'
    in the 'FOL' and 'FUZZY' subdirectory respectively.

    Returns None. KeyboardInterrupt and SystemExit are logged and not
    propagated.
    '''
    log = logging.getLogger(self.fold_id)
    params = self.params
    log.info('Running fold %d of %d...' % (params.foldIdx + 1,
                                           params.foldCount))
    outDir = params.directory
    try:
        # Apply noisy string clustering, if noisy domains are configured
        log.debug('Transforming noisy strings...')
        if params.noisyStringDomains is None:
            trainDBs = params.learnDBs
            evalDBs = params.testDBs
        else:
            transformer = NoisyStringTransformer(
                params.mln, params.noisyStringDomains, True)
            trainDBs = transformer.materializeNoisyDomains(params.learnDBs)
            evalDBs = transformer.transformDBs(params.testDBs)

        # train the MLN
        mln = params.mln
        log.debug('Starting learning...')
        # NOTE(review): `verbose` is not a local name of this method;
        # presumably it is defined at module level -- confirm.
        learned = mln.learnWeights(
            trainDBs,
            method=params.learningMethod,
            verbose=verbose,
            evidencePreds=["is_a", "ac_word"],
            partSize=2,
            optimizer='cg',
            maxrepeat=1)

        # store the learned MLN in a file
        mlnPath = os.path.join(outDir, 'run_%d.mln' % params.foldIdx)
        learned.writeToFile(mlnPath)
        log.debug('Finished learning.')

        # evaluate the MLN
        log.debug('Evaluating.')
        learned.setClosedWorldPred(None)
        if params.cwPreds is None:
            # default: every predicate except the query predicate
            params.cwPreds = [
                name for name in mln.predicates if name != params.queryPred
            ]
        # the selection is completed before any predicate gets closed
        closedWorld = [
            name for name in params.cwPreds if name in learned.predicates
        ]
        for name in closedWorld:
            learned.setClosedWorldPred(name)

        # one evaluation per logic, FOL first, then FUZZY
        evaluations = (('FirstOrderLogic', 'FOL'), ('FuzzyLogic', 'FUZZY'))
        for logicName, subdir in evaluations:
            matrix = ConfusionMatrix()
            self.evalMLN(learned, evalDBs, logicName, matrix)
            matrixName = 'conf_matrix_%d.cm' % params.foldIdx
            matrix.toFile(os.path.join(outDir, subdir, matrixName))
        log.debug('Evaluation finished.')
    except (KeyboardInterrupt, SystemExit):
        log.critical("Exiting...")
        return None
params.learnconf = project.learnconf params.queryconf = project.queryconf params.querypred = predname foldRunnables.append(XValFold(params)) logger.info('Params for fold %d:\n%s' % (fold_idx, str(params))) if multicore: # set up a pool of worker processes try: workerPool = Pool() logger.info('Starting %d-fold Cross-Validation in %d processes.' % (folds, workerPool._processes)) result = workerPool.map_async(runFold, foldRunnables).get() workerPool.close() workerPool.join() cm = ConfusionMatrix() for r in result: cm.combine(r.confmat) elapsedTimeMP = time.time() - startTime cm.toFile(os.path.join(expdir, 'conf_matrix.cm')) # create the pdf table and move it into the log directory # this is a dirty hack since pdflatex apparently # does not support arbitrary output paths pdfname = 'conf_matrix' logger.info('creating pdf if confusion matrix...') cm.toPDF(pdfname) os.rename('%s.pdf' % pdfname, os.path.join(expdir, '%s.pdf' % pdfname)) except (KeyboardInterrupt, SystemExit, SystemError): logger.critical("Caught KeyboardInterrupt, terminating workers") workerPool.terminate()