def testBatch(self): tlog = simple.Compiler( db=os.path.join(testtensorlog.TEST_DATA_DIR,"textcattoy3.cfacts"), prog=os.path.join(testtensorlog.TEST_DATA_DIR,"textcat3.ppr")) trainData = tlog.load_dataset(os.path.join(testtensorlog.TEST_DATA_DIR,"toytrain.exam")) testData = tlog.load_dataset(os.path.join(testtensorlog.TEST_DATA_DIR,"toytest.exam")) mode = trainData.keys()[0] TX,TY = trainData[mode] UX,UY = testData[mode] inference = tlog.inference(mode) trueY = tf.placeholder(tf.float32, shape=UY.shape, name='tensorlog/trueY') correct = tf.equal(tf.argmax(trueY,1), tf.argmax(inference,1)) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) test_batch_fd = {tlog.input_placeholder_name(mode):UX, trueY.name:UY} loss = tlog.loss(mode) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1) train_step = optimizer.minimize(loss) train_batch_fd = {tlog.input_placeholder_name(mode):TX, tlog.target_output_placeholder_name(mode):TY} session = tf.Session() session.run(tf.global_variables_initializer()) acc0 = session.run(accuracy, feed_dict=test_batch_fd) print 'initial accuracy',acc0 self.assertTrue(acc0<0.6) for i in range(10): print 'epoch',i+1 session.run(train_step, feed_dict=train_batch_fd) acc1 = session.run(accuracy, feed_dict=test_batch_fd) print 'final accuracy',acc1 self.assertTrue(acc1>=0.9) # test a round-trip serialization # saves the db cacheDir = tempfile.mkdtemp() db_file = os.path.join(cacheDir,'simple.db') tlog.set_all_db_params_to_learned_values(session) tlog.serialize_db(db_file) # load everything into a new graph and don't reset the learned params new_graph = tf.Graph() with new_graph.as_default(): tlog2 = simple.Compiler( db=db_file, prog=os.path.join(testtensorlog.TEST_DATA_DIR,"textcat3.ppr"), autoset_db_params=False) # reconstruct the accuracy measure inference2 = tlog2.inference(mode) trueY2 = tf.placeholder(tf.float32, shape=UY.shape, name='tensorlog/trueY2') correct2 = tf.equal(tf.argmax(trueY2,1), tf.argmax(inference2,1)) accuracy2 = 
tf.reduce_mean(tf.cast(correct2, tf.float32)) # eval accuracy in a new session session2 = tf.Session() session2.run(tf.global_variables_initializer()) test_batch_fd2 = {tlog2.input_placeholder_name(mode):UX, trueY2.name:UY} acc3 = session2.run(accuracy2, feed_dict=test_batch_fd2) print 'accuracy after round-trip serialization',acc3 self.assertTrue(acc3>=0.9) session.close()
def runMain():
    """Run the compile-all steps and the TensorFlow query run of the experiment.

    Parameters come from expt.setExptParams().  expt.compileAll is invoked
    but its result is not reported; the values returned by tfCompileAll and
    runTF are.

    Returns:
      the pair (result of tfCompileAll, result of runTF).
    """
    db, prog, modeSet, queries = expt.setExptParams()
    tlog = simple.Compiler(db=db, prog=prog, autoset_db_params=False)
    # result intentionally discarded; only the call itself is wanted here
    expt.compileAll(db, prog, modeSet, queries)
    tf_fps = tfCompileAll(tlog, modeSet, queries)  # expect <= 2.5 fps
    tf_qps = runTF(tlog)  # expect less than 23 qps
    return (tf_fps, tf_qps)
def runMain(): (ti, sparseX) = expt.setExptParams() X = sparseX.todense() # compile all the functions we'll need before I set up the session tlog = simple.Compiler(db=ti.db, prog=ti.prog, autoset_db_params=False) for modeString in [ "t_stress/io", "t_influences/io", "t_cancer_spont/io", "t_cancer_smoke/io" ]: _ = tlog.inference(modeString) session = tf.Session() session.run(tf.global_variables_initializer()) start0 = time.time() for modeString in [ "t_stress/io", "t_influences/io", "t_cancer_spont/io", "t_cancer_smoke/io" ]: session.run(tf.global_variables_initializer()) print 'eval', modeString, fd = {tlog.input_placeholder_name(modeString): X} session.run(tlog.inference(modeString), feed_dict=fd) print 'time', time.time() - start0, 'sec' tot = time.time() - start0 print 'total time', tot, 'sec' return tot
def testMToyMatParam(self): tlog = simple.Compiler( db=os.path.join(testtensorlog.TEST_DATA_DIR,"matchtoy.cfacts"), prog=os.path.join(testtensorlog.TEST_DATA_DIR,"matchtoy.ppr")) trainData = tlog.load_dataset(os.path.join(testtensorlog.TEST_DATA_DIR,"matchtoy-train.exam")) tlog.db.markAsParameter('dabbrev',2) factDict = tlog.db.matrixAsPredicateFacts('dabbrev',2,tlog.db.matEncoding[('dabbrev',2)]) print 'before learning',len(factDict),'dabbrevs' self.assertTrue(len(factDict)==5) for f in sorted(factDict.keys()): print '>',str(f),factDict[f] # expt pipeline mode = trainData.keys()[0] TX,TY = trainData[mode] inference = tlog.inference(mode) trueY = tf.placeholder(tf.float32, shape=TY.shape, name='tensorlog/trueY') loss = tlog.loss(mode) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1) train_step = optimizer.minimize(loss) train_batch_fd = {tlog.input_placeholder_name(mode):TX, tlog.target_output_placeholder_name(mode):TY} session = tf.Session() session.run(tf.global_variables_initializer()) for i in range(5): print 'epoch',i+1 session.run(train_step, feed_dict=train_batch_fd) tlog.set_all_db_params_to_learned_values(session)
def testTCToyTypes(self): matrixdb.conf.ignore_types = False tlog = simple.Compiler( db=os.path.join(testtensorlog.TEST_DATA_DIR,"textcattoy3.cfacts"), prog=os.path.join(testtensorlog.TEST_DATA_DIR,"textcat3.ppr")) trainData = tlog.load_small_dataset(os.path.join(testtensorlog.TEST_DATA_DIR,"toytrain.exam")) mode = trainData.keys()[0] docs,labels = trainData[mode] xc = tlog.get_cross_compiler() ops = xc.possibleOps(docs,'doc') print 'doc ops',ops self.assertTrue(len(ops)==1) (words,wordType) = ops[0] self.assertTrue(wordType=='word') ops = xc.possibleOps(words,'word') self.assertTrue(len(ops)==3) pairs = None for (expr,exprType) in ops: if exprType=='labelWordPair': pairs = expr break self.assertTrue(pairs is not None) ops = xc.possibleOps(pairs,'labelWordPair') self.assertTrue(len(ops)==2) for (expr,exprType) in ops: self.assertTrue(exprType=='word') close_cross_compiler(xc)
def runMain(num=250): params = expt.setExptParams(num) prog = params['prog'] tlog = simple.Compiler(db=prog.db, prog=prog, autoset_db_params=False) train_data = tlog.load_big_dataset('inputs/train-%d.exam' % num) mode = params['targetMode'] loss = tlog.loss(mode) optimizer = tf.train.AdagradOptimizer(0.1) train_step = optimizer.minimize(loss) session = tf.Session() session.run(tf.global_variables_initializer()) t0 = time.time() epochs = 10 for i in range(epochs): b = 0 for (_,(TX,TY)) in tlog.minibatches(train_data,batch_size=125): print 'epoch',i+1,'of',epochs,'minibatch',b+1 train_fd = {tlog.input_placeholder_name(mode):TX, tlog.target_output_placeholder_name(mode):TY} session.run(train_step, feed_dict=train_fd) b += 1 print 'learning time',time.time()-t0,'sec' predicted_y = tlog.inference(mode) actual_y = tlog.target_output_placeholder(mode) correct_predictions = tf.equal(tf.argmax(actual_y,1), tf.argmax(predicted_y,1)) accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) test_data = tlog.load_small_dataset('inputs/test-%d.exam' % num) UX,UY = test_data[mode] test_fd = {tlog.input_placeholder_name(mode):UX, tlog.target_output_placeholder_name(mode):UY} acc = session.run(accuracy, feed_dict=test_fd) print 'test acc',acc return acc #expect 27.2
def setup_tlog(maxD,factFile,trainFile,testFile):
    """Create a Compiler for grid.ppr and load its train and test datasets.

    Args:
      maxD: value assigned to the program's maxDepth.
      factFile: database specification handed to simple.Compiler.
      trainFile: path given to load_small_dataset for the training data.
      testFile: path given to load_small_dataset for the test data.

    Returns:
      the triple (compiler, training data, test data).
    """
    tlog = simple.Compiler(db=factFile, prog="grid.ppr")
    # the 'edge'/2 relation is marked as a parameter of the database
    tlog.prog.db.markAsParameter('edge', 2)
    tlog.prog.maxDepth = maxD
    trainData, testData = [
        tlog.load_small_dataset(path) for path in (trainFile, testFile)
    ]
    return (tlog, trainData, testData)
def runMain(saveInPropprFormat=True): params = expt.setExptParams() prog = params['prog'] tlog = simple.Compiler(db=prog.db, prog=prog, autoset_db_params=False) train_data = tlog.load_small_dataset('inputs/train.examples') test_data = tlog.load_small_dataset('inputs/test.examples') mode = 'samebib/io' TX, TY = train_data[mode] UX, UY = test_data[mode] loss = tlog.loss(mode) optimizer = tf.train.AdagradOptimizer(0.1) train_step = optimizer.minimize(loss) train_fd = { tlog.input_placeholder_name(mode): TX, tlog.target_output_placeholder_name(mode): TY } test_fd = { tlog.input_placeholder_name(mode): UX, tlog.target_output_placeholder_name(mode): UY } t0 = time.time() session = tf.Session() session.run(tf.global_variables_initializer()) epochs = 30 for i in range(epochs): # progress print 'epoch', i + 1, 'of', epochs session.run(train_step, feed_dict=train_fd) print 'learning time', time.time() - t0, 'sec' inference = tlog.inference(mode) predicted_y = session.run(inference, feed_dict=test_fd) actual_y = tlog.target_output_placeholder(mode) correct_predictions = tf.equal(tf.argmax(actual_y, 1), tf.argmax(predicted_y, 1)) accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) if saveInPropprFormat: # save test results in ProPPR format from tensorlog import declare from tensorlog import dataset from tensorlog import expt as tlog_expt m = declare.asMode(mode) native_test_data = dataset.Dataset({m: tlog.xc._unwrapOutput(UX)}, {m: tlog.xc._unwrapOutput(UY)}) savedTestExamples = 'tmp-cache/cora-test.examples' savedTestPredictions = 'tmp-cache/cora-test.solutions.txt' native_test_data.saveProPPRExamples(savedTestExamples, tlog.db) tlog_expt.Expt.predictionAsProPPRSolutions( savedTestPredictions, 'samebib', tlog.db, tlog.xc._unwrapOutput(UX), tlog.xc._unwrapOutput(predicted_y)) print 'ready for commands like: proppr eval %s %s --metric auc --defaultNeg' % ( savedTestExamples, savedTestPredictions) acc = session.run(accuracy, feed_dict=test_fd) print 'test acc', 
acc return acc
def testBuilder2(self):
    """Build the two 'predict' rules with Builder operators and run the learner.

    The rules are added directly to the Builder with '+=', compiled against
    the textcattoy3 facts, and the resulting Compiler is passed to
    runTextCatLearner.
    """
    b = simple.Builder()
    (predict, assign, weighted,
     hasWord, posPair, negPair) = b.predicates("predict assign weighted hasWord posPair negPair")
    X, Pos, Neg, F, W = b.variables("X Pos Neg F W")

    pos_rule = predict(X,Pos) <= assign(Pos,'pos','label') // (weighted(F) | hasWord(X,W) & posPair(W,F))
    b += pos_rule
    neg_rule = predict(X,Neg) <= assign(Neg,'neg','label') // (weighted(F) | hasWord(X,W) & negPair(W,F))
    b += neg_rule

    dbSpec = os.path.join(testtensorlog.TEST_DATA_DIR, "textcattoy3.cfacts")
    tlog = simple.Compiler(db=dbSpec, prog=b.rules)
    self.runTextCatLearner(tlog)
def testTCToyIgnoringTypes(self):
    """Check the untyped operations the cross-compiler offers on the toy data.

    With matrixdb.conf.ignore_types set to True, possibleOps is expected to
    return two entries per arity-2 predicate in the database's matEncoding,
    and none of the entries may be a tuple.

    The cross-compiler is closed even when an assertion fails.
    """
    matrixdb.conf.ignore_types = True
    data_dir = testtensorlog.TEST_DATA_DIR
    tlog = simple.Compiler(
        db=os.path.join(data_dir,"textcattoy3.cfacts"),
        prog=os.path.join(data_dir,"textcat3.ppr"))
    trainData = tlog.load_small_dataset(os.path.join(data_dir,"toytrain.exam"))
    mode = trainData.keys()[0]
    docs,labels = trainData[mode]
    xc = tlog.get_cross_compiler()
    try:
        ops = xc.possibleOps(docs)
        binary_predicates = [functor for (functor,arity) in tlog.db.matEncoding if arity==2]
        self.assertEqual(len(ops), len(binary_predicates)*2)
        for x in ops:
            # ops should just be tensors
            self.assertFalse(isinstance(x,tuple))
    finally:
        close_cross_compiler(xc)
def testBuilder3(self):
    """Build a schema plus the two 'predict' rules and run the learner.

    Type declarations are added to b.schema and rules to b.rules; the
    database is the untyped textcattoy facts file, so the types come from
    the schema declared here.
    """
    b = simple.Builder()
    (predict, assign, weighted, hasWord,
     posPair, negPair, label) = b.predicates("predict assign weighted hasWord posPair negPair label")
    (doc_t, label_t,
     word_t, labelWordPair_t) = b.types("doc_t label_t word_t labelWordPair_t")

    # argument types for each predicate
    b.schema += predict(doc_t, label_t)
    b.schema += hasWord(doc_t, word_t)
    b.schema += posPair(word_t, labelWordPair_t)
    b.schema += negPair(word_t, labelWordPair_t)
    b.schema += label(label_t)

    X, Pos, Neg, F, W = b.variables("X Pos Neg F W")
    pos_rule = predict(X,Pos) <= assign(Pos,'pos','label') // (weighted(F) | hasWord(X,W) & posPair(W,F))
    b.rules += pos_rule
    neg_rule = predict(X,Neg) <= assign(Neg,'neg','label') // (weighted(F) | hasWord(X,W) & negPair(W,F))
    b.rules += neg_rule

    # use the untyped version of the facts to make sure the schema works
    b.db = os.path.join(testtensorlog.TEST_DATA_DIR, "textcattoy.cfacts")
    tlog = simple.Compiler(db=b.db, prog=b.rules)
    self.runTextCatLearner(tlog)
def check_learning_with_udp(self,ruleStrings,plugins): db = matrixdb.MatrixDB.loadFile(os.path.join(testtensorlog.TEST_DATA_DIR,"textcattoy3.cfacts")) rules = testtensorlog.rules_from_strings(ruleStrings) prog = program.ProPPRProgram(rules=rules,db=db,plugins=plugins) prog.setAllWeights() mode = declare.asMode("predict/io") prog.compile(mode) fun = prog.function[(mode,0)] print "\n".join(fun.pprint()) tlog = simple.Compiler(db=db, prog=prog) trainData = tlog.load_dataset(os.path.join(testtensorlog.TEST_DATA_DIR,"toytrain.exam")) testData = tlog.load_dataset(os.path.join(testtensorlog.TEST_DATA_DIR,"toytest.exam")) mode = trainData.keys()[0] TX,TY = trainData[mode] UX,UY = testData[mode] inference = tlog.inference(mode) trueY = tf.placeholder(tf.float32, shape=UY.shape, name='tensorlog/trueY') correct = tf.equal(tf.argmax(trueY,1), tf.argmax(inference,1)) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) test_batch_fd = {tlog.input_placeholder_name(mode):UX, trueY.name:UY} loss = tlog.loss(mode) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1) train_step = optimizer.minimize(loss) train_batch_fd = {tlog.input_placeholder_name(mode):TX, tlog.target_output_placeholder_name(mode):TY} session = tf.Session() session.run(tf.global_variables_initializer()) acc0 = session.run(accuracy, feed_dict=test_batch_fd) print 'initial accuracy',acc0 self.assertTrue(acc0<0.6) for i in range(10): print 'epoch',i+1 session.run(train_step, feed_dict=train_batch_fd) acc1 = session.run(accuracy, feed_dict=test_batch_fd) print 'final accuracy',acc1 self.assertTrue(acc1>=0.9) session.close()
def runMain(): # generate the data for a 10-by-10 grid (factFile,trainFile,testFile) = expt.genInputs(16) # generate the rules - for transitive closure b = simple.Builder() path,edge = b.predicates("path,edge") X,Y,Z = b.variables("X,Y,Z") b.rules += path(X,Y) <= edge(X,Y) b.rules += path(X,Y) <= edge(X,Z) & path(Z,Y) # construct a Compiler object tlog = simple.Compiler(db=factFile,prog=b.rules) # configure the database so that edge weights are a parameter tlog.prog.db.markAsParameter('edge',2) # configure the program so that maximum recursive depth is 16 tlog.prog.maxDepth = 16 # compile the rules, plus a query mode, into the inference function # we want to optimize - queries of the form {Y:path(x,Y)} where x is # a given starting point in the grid (an input) and Y is an output mode = 'path/io' predicted_y = tlog.inference(mode) # when we ask for an inference function, Tensorlog also compiles a # loss function. ask for the placeholder used to hold the desired # output when we're computing loss, and use that to define an # accuracy metric, for testing actual_y = tlog.target_output_placeholder(mode) correct_predictions = tf.equal(tf.argmax(actual_y,1), tf.argmax(predicted_y,1)) accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) # now get up the loss used in learning from the compiler and set up # a learner for it unregularized_loss = tlog.loss(mode) optimizer = tf.train.AdagradOptimizer(1.0) train_step = optimizer.minimize(unregularized_loss) # set up the session session = tf.Session() session.run(tf.global_variables_initializer()) # load the training and test data trainData = tlog.load_small_dataset(trainFile) testData = tlog.load_small_dataset(testFile) # run the optimizer for 20 epochs (tx,ty) = trainData[mode] train_fd = {tlog.input_placeholder_name(mode):tx, tlog.target_output_placeholder_name(mode):ty} for i in range(20): session.run(train_step, feed_dict=train_fd) print 'epoch',i+1,'train loss and 
accuracy',session.run([unregularized_loss,accuracy], feed_dict=train_fd) # test performance (ux,uy) = testData[mode] test_fd = {tlog.input_placeholder_name(mode):ux, tlog.target_output_placeholder_name(mode):uy} acc = session.run(accuracy, feed_dict=test_fd) print 'test acc',acc return acc
def run_main(): logging.basicConfig(level=logging.DEBUG) t0 = time.time() # configure the experiment, generate the rules, and initialize the # Tensorlog compiler. If we're training, then also initialize the # weight vectors to some sort of default values. c = configure_from_command_line(sys.argv[1:]) b = generate_rules() # databases can be stored in two formats: the .db format or the # .cfacts format. A .cfacts file is basically a tab-separated-value # file, where the first column is a relation name, the other columns # are arguments to that relation, and the final column is a weight # (if it's a number). The .cfacts file can also include typing # information, in lines like '# :- # mentions_entity(question_t,entity_t)' .cfacts files must be sorted # by relation type. The .db format is a binary format which is more # compact and faster to load. The syntax "foo.db|foo.cfacts" for a # database tells Tensorlog to load a cached .db version of the # .cfacts file if it exists (and is more recent than the .cfacts # file) and otherwise to load the .cfacts file and create a cached # version in the .db file. 
dbspec = '/tmp/train-%d.db|inputs/train-%d.cfacts' % ( c.num, c.num) if c.action == 'train' else c.model tlog = simple.Compiler(db=dbspec, prog=b.rules, autoset_db_params=(c.action == 'train')) # set up the optimizer mode = 'answer/io' unregularized_loss = tlog.loss(mode) optimizer = tf.train.AdagradOptimizer(c.rate) train_step = optimizer.minimize(unregularized_loss) # define the measure we'll use to report quality of a learned model predicted_y = tlog.inference(mode) # inference is the # proof-counting semantics # followed by a softmax actual_y = tlog.target_output_placeholder(mode) correct_predictions = tf.equal(tf.argmax(actual_y, 1), tf.argmax(predicted_y, 1)) accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) # initialize the tensorflow session session = tf.Session() session.run(tf.global_variables_initializer()) t1 = time.time() print 'compilation and session initialization', (t1 - t0) / 1000.0, 'sec' if c.action == 'test': # a small_dataset is just a dictionary mapping function names (ie # modes), like "answer/io", to pairs X,Y, where X is an input and # Y a desired output. If the action is to 'test' a learned model # then load in the test data and find that x,y pair. test_data = tlog.load_small_dataset('inputs/test-%d.exam' % c.num) _, (x, y) = test_data.items()[0] # ... then compute error rate and print it test_batch_fd = { tlog.input_placeholder_name(mode): x, tlog.target_output_placeholder(mode): y } print 'test error', 100 * ( 1.0 - session.run(accuracy, feed_dict=test_batch_fd)), '%' else: assert c.action == 'train' # load_big_dataset returns an object which can enumerate # mini-batches. The object holds all the input and output vectors # for tensorlog as sparse vectors (ie, it doesn't stream thru them # from disk). 
This is still important from memory usage point of # view, however, because we cannot encode these x,y pairs as # sparse in tensorflow, since tensorflow doesn't support sparse # matrix-sparse matrix product, only dense matrix-sparse matrix # product. So a the big dataset object will convert each # minibatch to a dense format on-the-fly before training on it. train_data = tlog.load_big_dataset('inputs/train-%d.exam' % c.num) t2 = time.time() print 'data loading', (t2 - t1), 'sec' # finally, run the learner for a fixed number of epochs for i in range(c.epochs): print 'starting epoch', i + 1, 'of', c.epochs, '...' b = 0 for _, (x, y) in tlog.minibatches(train_data, batch_size=c.batch_size): train_batch_fd = { tlog.input_placeholder_name(mode): x, tlog.target_output_placeholder_name(mode): y } session.run(train_step, feed_dict=train_batch_fd) print 'finished minibatch', b + 1, 'epoch', i + 1, 'cumulative training time', ( time.time() - t2), 'sec' b += 1 t3 = time.time() print 'learning', (t3 - t2), 'sec' # We have now learned values for all the parameters. This command # copies those learned values back into the knowledge # graph/database maintained by Tensorlog. tlog.set_all_db_params_to_learned_values(session) # Finally, write the whole knowledge graph, including the learned # parameters, out to disk in a compact format, which can be read # back in when we use the 'test' action tlog.serialize_db('learned-model.db') print 'wrote learned model to learned-model.db'
def testMinibatch(self):
    """Run runTextCatLearner on a Compiler built from the textcattoy3 files."""
    data_dir = testtensorlog.TEST_DATA_DIR
    db_path = os.path.join(data_dir, "textcattoy3.cfacts")
    prog_path = os.path.join(data_dir, "textcat3.ppr")
    self.runTextCatLearner(simple.Compiler(db=db_path, prog=prog_path))
mainOptlist, mainArgs = getopt.getopt(sys.argv[1:], 'x', mainArgspec) mainOptdict = dict(mainOptlist) c = OptHolder() c.kb_version = mainOptdict.get('--kb_version', 'typed-small') c.epochs = int(mainOptdict.get('--epochs', '10')) c.num_train = int(mainOptdict.get('--num_train', '100')) c.num_test = int(mainOptdict.get('--num_test', '200')) c.prog_file = mainOptdict.get('--prog_file', 'dialog.ppr') for (var_name, value) in c.__dict__.items(): command_line_opt = '--%s' % var_name print '# config:', var_name, '=', value, 'from', command_line_opt, mainOptdict.get( command_line_opt) # create the simple compiler and load the data tlog = simple.Compiler(db='idb-%s.cfacts' % c.kb_version, prog=c.prog_file) train_data = tlog.load_dataset('train-%d-corpus.exam' % c.num_train) test_data = tlog.load_dataset('test-%d-corpus.exam' % c.num_test) # check the data is as expected mode = 'answer/io' assert len(train_data.keys()) == 1 and mode in train_data assert len(test_data.keys()) == 1 and mode in train_data TX, TY = train_data[mode] UX, UY = test_data[mode] # for evaluating performance inference = tlog.inference(mode) trueY = tf.placeholder(tf.float32, shape=UY.shape, name='tensorlog/trueY') prediction_is_correct = tf.equal(tf.argmax(trueY, 1), tf.argmax(inference, 1))