def test_materialize():
    initial_facts = [Fact('q', ['{}'.format(idx), '{}'.format(idx + 1)]) for idx in range(64)]

    parser = KnowledgeBaseParser(initial_facts)
    parser.predicate_to_index['p'] = 2

    clauses = [
        parse_clause('q(X, Z) :- q(X, Y), q(Y, Z)'),
        parse_clause('p(X, Y) :- q(X, Y)')
    ]

    inferred_facts = materialize(initial_facts, clauses, parser)
    inferred_triples = [(f.argument_names[0], f.predicate_name, f.argument_names[1]) for f in inferred_facts]

    entities = {s for (s, _, _) in inferred_triples} | {o for (_, _, o) in inferred_triples}

    for e1 in entities:
        for e2 in entities:
            if int(e1) < int(e2):
                assert (str(e1), 'q', str(e2)) in inferred_triples
                assert (str(e1), 'p', str(e2)) in inferred_triples
                print('+')
            else:
                assert (str(e1), 'q', str(e2)) not in inferred_triples
                assert (str(e1), 'p', str(e2)) not in inferred_triples
                print('-')
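
# What the test above asserts is exactly the least fixpoint of the two clauses
# over the initial chain q(0, 1), q(1, 2), ..., q(63, 64): q becomes the strict
# order i < j, and p a copy of q. A minimal sketch of that semantics, as naive
# forward chaining over (s, p, o) tuples (illustrative only, not the project's
# materialize() implementation):


def naive_materialize(triples):
    triples = set(triples)
    while True:
        # q(X, Z) :- q(X, Y), q(Y, Z)
        inferred = {(x, 'q', z)
                    for (x, p1, y1) in triples if p1 == 'q'
                    for (y2, p2, z) in triples if p2 == 'q' and y2 == y1}
        # p(X, Y) :- q(X, Y)
        inferred |= {(x, 'p', y) for (x, p, y) in triples if p == 'q'}
        if inferred <= triples:  # fixpoint reached: nothing new to add
            return triples
        triples |= inferred


# e.g. naive_materialize({('0', 'q', '1'), ('1', 'q', '2')}) also contains
# ('0', 'q', '2') and ('0', 'p', '2').
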
def test_losses():
    triples = [
        ('e1', 'p', 'e2'),
        ('e2', 'q', 'e3'),
        ('e1', 'r', 'e2'),
        ('e2', 's', 'e3')
    ]

    def fact(s, p, o):
        return Fact(predicate_name=p, argument_names=[s, o])

    facts = [fact(s, p, o) for s, p, o in triples]
    parser = KnowledgeBaseParser(facts)

    nb_predicates = len(parser.predicate_vocabulary)
    predicate_embedding_size = 100

    predicate_embedding_layer = tf.get_variable('predicates',
                                                shape=[nb_predicates + 1, predicate_embedding_size],
                                                initializer=tf.contrib.layers.xavier_initializer())

    clauses = [parse_clause('p(X, Y) :- q(Y, X)'), parse_clause('r(X, Y) :- s(X, Y)')]

    loss = clauses_to_equality_loss('TransE', clauses, 'l2_sqr',
                                    predicate_embedding_layer, parser.predicate_to_index,
                                    entity_embedding_size=predicate_embedding_size)

    optimizer = tf.train.AdagradOptimizer(0.1)
    minimization_step = optimizer.minimize(loss, var_list=[predicate_embedding_layer])

    init_op = tf.global_variables_initializer()

    with tf.Session() as session:
        session.run(init_op)

        for _ in range(32):
            session.run([minimization_step])

        loss_value = session.run([loss])[0]

        p_idx, q_idx = parser.predicate_to_index['p'], parser.predicate_to_index['q']
        r_idx, s_idx = parser.predicate_to_index['r'], parser.predicate_to_index['s']

        predicate_embedding_layer_value = session.run([predicate_embedding_layer])[0]

        p_value, q_value = predicate_embedding_layer_value[p_idx, :], predicate_embedding_layer_value[q_idx, :]
        r_value, s_value = predicate_embedding_layer_value[r_idx, :], predicate_embedding_layer_value[s_idx, :]

        # Under TransE, p(X, Y) :- q(Y, X) is enforced by p = -q,
        # and r(X, Y) :- s(X, Y) by r = s
        estimated_loss_value = np.square(p_value + q_value).sum() + np.square(r_value - s_value).sum()

        assert loss_value > 0
        assert estimated_loss_value > 0

        np.testing.assert_allclose(loss_value, estimated_loss_value, rtol=1e-4)

    tf.reset_default_graph()
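
# Why the expected value has that shape: TransE scores a triple (X, p, Y) as
# -d(X + p, Y), so the body q(Y, X) means Y + q ≈ X while the head p(X, Y)
# means X + p ≈ Y; the rule holds for every entity pair exactly when p = -q,
# and r(X, Y) :- s(X, Y) holds when r = s. A hedged restatement of the
# penalty being minimised (the function name is ours, not the project's API):

import numpy as np


def transe_equality_penalty(p, q, r, s):
    # Zero iff p == -q and r == s, matching the assertions above
    return np.square(p + q).sum() + np.square(r - s).sum()
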
def _test_adversarial():
    triples = [
        ('john', 'friendOf', 'mark'),
        ('mark', 'friendOf', 'aleksi'),
        ('mark', 'friendOf', 'dazdrasmygda')
    ]

    def fact(s, p, o):
        return Fact(predicate_name=p, argument_names=[s, o])

    facts = [fact(s, p, o) for s, p, o in triples]
    parser = KnowledgeBaseParser(facts)

    clauses = [parse_clause('friendOf(X, Y) :- friendOf(Y, X)')]

    nb_entities = len(parser.entity_vocabulary)
    nb_predicates = len(parser.predicate_vocabulary)

    entity_embedding_size = 100
    predicate_embedding_size = 100

    entity_embedding_layer = tf.get_variable('entities',
                                             shape=[nb_entities + 1, entity_embedding_size],
                                             initializer=tf.contrib.layers.xavier_initializer())
    predicate_embedding_layer = tf.get_variable('predicates',
                                                shape=[nb_predicates + 1, predicate_embedding_size],
                                                initializer=tf.contrib.layers.xavier_initializer())

    model_class = models.get_function('TransE')
    similarity_function = similarities.get_function('l1')
    model_parameters = dict(similarity_function=similarity_function)

    batch_size = 1000

    adversarial = Adversarial(clauses=clauses, parser=parser,
                              entity_embedding_layer=entity_embedding_layer,
                              predicate_embedding_layer=predicate_embedding_layer,
                              model_class=model_class, model_parameters=model_parameters,
                              batch_size=batch_size)

    init_op = tf.global_variables_initializer()

    with tf.Session() as session:
        session.run(init_op)

        # One set of violating embeddings per universally quantified variable (X and Y)
        assert len(adversarial.parameters) == 2

        for violating_embeddings in adversarial.parameters:
            shape = session.run(tf.shape(violating_embeddings))
            assert (shape == (batch_size, entity_embedding_size)).all()

        loss_value = session.run(adversarial.loss)
        errors_value = session.run(adversarial.errors)

        var1, var2 = adversarial.parameters[0], adversarial.parameters[1]

        X_values = session.run(var1 if 'X' in var1.name else var2)
        Y_values = session.run(var2 if 'Y' in var2.name else var1)

        p_value = session.run(tf.nn.embedding_lookup(predicate_embedding_layer, 1))

        assert X_values.shape == (batch_size, entity_embedding_size)
        assert Y_values.shape == (batch_size, entity_embedding_size)
        assert p_value.shape == (predicate_embedding_size,)

        # TransE with the L1 similarity: score(s, p, o) = -||(e_s + r_p) - e_o||_1
        head_scores = -np.sum(np.abs((X_values + p_value) - Y_values), axis=1)
        body_scores = -np.sum(np.abs((Y_values + p_value) - X_values), axis=1)

        assert int(errors_value) == np.sum((head_scores < body_scores).astype(int))

        linear_losses = body_scores - head_scores
        np_loss_values = np.sum(linear_losses * (linear_losses > 0))

        assert np.abs(loss_value - np_loss_values) < 1e-3

    tf.reset_default_graph()
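
# The reference computation above, restated compactly: the clause is violated
# by an adversarially chosen pair (X, Y) whenever the body outscores the head,
# and the adversarial loss accumulates the positive parts of those margins.
# Illustrative helper (not the project's API):


def hinge_violation_stats(head_scores, body_scores):
    margins = [b - h for h, b in zip(head_scores, body_scores)]
    loss = sum(m for m in margins if m > 0)
    errors = sum(1 for m in margins if m > 0)
    return loss, errors
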
import logging

import numpy as np
import tensorflow as tf

import pytest

# Fact, KnowledgeBaseParser, parse_clause, models and similarities come from
# the project's own modules; their import lines are elided in this excerpt.

logger = logging.getLogger(__name__)

triples = [('a', 'p', 'b'), ('c', 'p', 'd'), ('a', 'q', 'b')]
facts = [Fact(predicate_name=p, argument_names=[s, o]) for s, p, o in triples]
parser = KnowledgeBaseParser(facts)

nb_entities = len(parser.entity_to_index)
nb_predicates = len(parser.predicate_to_index)

# Clauses
clause_str = 'q(X, Y) :- p(X, Y)'
clauses = [parse_clause(clause_str)]

# Instantiating the model parameters
model_class = models.get_function('TransE')
similarity_function = similarities.get_function('l2_sqr')
model_parameters = dict(similarity_function=similarity_function)


@pytest.mark.closedform
def test_transe_unit_cube():
    for seed in range(32):
        tf.reset_default_graph()

        np.random.seed(seed)
        tf.set_random_seed(seed)
def main(argv):
    def formatter(prog):
        return argparse.HelpFormatter(prog, max_help_position=100, width=200)

    argparser = argparse.ArgumentParser('Generate Test-I, Test-II and Test-ALL test sets',
                                        formatter_class=formatter)

    argparser.add_argument('train', action='store', type=str, default=None)
    argparser.add_argument('valid', action='store', type=str, default=None)
    argparser.add_argument('test', action='store', type=str, default=None)
    argparser.add_argument('clauses', action='store', type=str, default=None)

    argparser.add_argument('--test-I', '-1', type=str, default='./testI.tsv')
    argparser.add_argument('--test-II', '-2', type=str, default='./testII.tsv')

    args = argparser.parse_args(argv)

    train_path, valid_path, test_path = args.train, args.valid, args.test
    test_I_path, test_II_path = args.test_I, args.test_II

    train_triples, _ = read_triples(train_path)
    valid_triples, _ = read_triples(valid_path)
    test_triples, _ = read_triples(test_path)

    def fact(s, p, o):
        return Fact(predicate_name=p, argument_names=[s, o])

    train_facts = [fact(s, p, o) for s, p, o in train_triples]
    valid_facts = [fact(s, p, o) for s, p, o in valid_triples]
    test_facts = [fact(s, p, o) for s, p, o in test_triples]

    parser = KnowledgeBaseParser(train_facts + valid_facts + test_facts)

    clauses_path = args.clauses
    with open(clauses_path, 'r') as f:
        clauses = [parse_clause(line.strip()) for line in f.readlines()]

    for clause in clauses:
        logging.debug('Clause: {}'.format(clause))

    # Put all triples in the form of sets of tuples
    train_triples = {(fact.argument_names[0], fact.predicate_name, fact.argument_names[1]) for fact in train_facts}
    valid_triples = {(fact.argument_names[0], fact.predicate_name, fact.argument_names[1]) for fact in valid_facts}
    test_triples = {(fact.argument_names[0], fact.predicate_name, fact.argument_names[1]) for fact in test_facts}

    m_train_facts = materialize(train_facts, clauses, parser)
    m_train_triples = {(fact.argument_names[0], fact.predicate_name, fact.argument_names[1]) for fact in m_train_facts}

    # Check that the sets of triples are non-empty
    assert len(train_triples) > 0
    assert len(valid_triples) > 0
    assert len(test_triples) > 0
    assert len(m_train_triples) > len(train_triples)

    # Check that their pairwise intersections are empty
    # (e.g. no test triple appears in the training set)
    assert len(train_triples & valid_triples) == 0
    assert len(train_triples & test_triples) == 0
    assert len(valid_triples & test_triples) == 0

    # Note that some of the test triples can be inferred by directly applying the rules to the
    # training set (pure logical inference). On each dataset, we further split the test set into
    # two parts, Test-I and Test-II: the former contains triples that cannot be directly inferred
    # by pure logical inference, and the latter the remaining test triples. Table 3 gives some
    # statistics of the datasets, including the number of entities, relations, triples in the
    # training/validation/Test-I/Test-II sets, and ground rules.

    # Triples that cannot be directly inferred by pure logical inference
    test_1_triples = test_triples - m_train_triples

    # Triples that can be directly inferred by pure logical inference
    test_2_triples = test_triples & m_train_triples

    nb_1_triples, nb_2_triples, nb_all_triples = len(test_1_triples), len(test_2_triples), len(test_triples)

    assert nb_1_triples + nb_2_triples == nb_all_triples
    assert len(test_1_triples | test_2_triples) == nb_all_triples

    logger.info('#Test-I: {}, #Test-II: {}, #Test-ALL: {}'.format(nb_1_triples, nb_2_triples, nb_all_triples))

    if test_I_path is not None:
        with open(test_I_path, 'w') as f:
            f.writelines(['{}\t{}\t{}\n'.format(s, p, o) for s, p, o in test_1_triples])

    if test_II_path is not None:
        with open(test_II_path, 'w') as f:
            f.writelines(['{}\t{}\t{}\n'.format(s, p, o) for s, p, o in test_2_triples])
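
# Example invocation (the script and file names below are illustrative):
#
#   $ python split_test_sets.py train.tsv valid.tsv test.tsv clauses.pl \
#         --test-I testI.tsv --test-II testII.tsv
#
# Each output file contains one tab-separated (subject, predicate, object)
# triple per line, in the same format read by read_triples().
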
def main(argv):
    def formatter(prog):
        return argparse.HelpFormatter(prog, max_help_position=100, width=200)

    argparser = argparse.ArgumentParser('Populate a Knowledge Base', formatter_class=formatter)

    argparser.add_argument('triples', action='store', type=str, default=None)
    argparser.add_argument('clauses', action='store', type=str, default=None)
    argparser.add_argument('--output', '-o', action='store', type=str, default=None)

    args = argparser.parse_args(argv)

    triples_path = args.triples
    clauses_path = args.clauses
    output_path = args.output

    triples, _ = read_triples(triples_path)

    # Parse the clauses using Sebastian's parser
    with open(clauses_path, 'r') as f:
        clauses_str = [line.strip() for line in f.readlines()]
    clauses = [parse_clause(clause_str) for clause_str in clauses_str]

    # Create a set containing all the entities appearing in the triples
    entity_names = {s for (s, _, _) in triples} | {o for (_, _, o) in triples}

    # Create a set containing all predicate names appearing in the triples and clauses
    predicate_names = {p for (_, p, _) in triples}
    for clause in clauses:
        predicate_names |= {clause.head.predicate.name}
        for atom in clause.body:
            predicate_names |= {atom.predicate.name}

    # Associate each entity and predicate with a unique index
    entity_to_idx = {entity: idx for idx, entity in enumerate(entity_names)}
    idx_to_entity = {idx: entity for entity, idx in entity_to_idx.items()}

    predicate_to_idx = {predicate: idx for idx, predicate in enumerate(predicate_names)}
    idx_to_predicate = {idx: predicate for predicate, idx in predicate_to_idx.items()}

    logger.info('Asserting facts ..')

    # Assert the facts: each triple (s, p, o) becomes a ternary fact p(s_idx, p_idx, o_idx)
    for (s, p, o) in triples:
        pyDatalog.assert_fact('p', entity_to_idx[s], predicate_to_idx[p], entity_to_idx[o])

    logger.info('Querying triples ..')

    ans = pyDatalog.ask('p(S, P, O)')
    print(len(ans.answers))

    logger.info('Loading rules ..')

    def atom_to_str(atom):
        atom_predicate_idx = predicate_to_idx[atom.predicate.name]
        atom_arg_0, atom_arg_1 = atom.arguments[0], atom.arguments[1]
        return 'p({}, {}, {})'.format(atom_arg_0, atom_predicate_idx, atom_arg_1)

    def clause_to_str(clause):
        head, body = clause.head, clause.body
        return '{} <= {}'.format(atom_to_str(head), ' & '.join([atom_to_str(a) for a in body]))

    rules_str = '\n'.join([clause_to_str(clause) for clause in clauses])
    pyDatalog.load(rules_str)

    logger.info('Querying triples ..')

    ans = pyDatalog.ask('p(S, P, O)')
    answers = sorted(ans.answers)

    if output_path is not None:
        # Map the index triples back to names and write them to file
        with open(output_path, 'w') as f:
            f.writelines('{}\t{}\t{}\n'.format(idx_to_entity[s_idx], idx_to_predicate[p_idx], idx_to_entity[o_idx])
                         for (s_idx, p_idx, o_idx) in answers)
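
# For illustration, assuming predicate_to_idx == {'q': 0, 'p': 1}, the clause
# q(X, Y) :- p(X, Y) is serialised for pyDatalog.load() as:
#
#   p(X, 0, Y) <= p(X, 1, Y)
#
# i.e. the whole graph is reified into a single ternary predicate p(S, P, O),
# with relations encoded as integer constants in the middle slot.
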
def main(argv):
    def formatter(prog):
        return argparse.HelpFormatter(prog, max_help_position=100, width=200)

    argparser = argparse.ArgumentParser('Plot Embeddings', formatter_class=formatter)

    argparser.add_argument('model', action='store', type=str)
    argparser.add_argument('adversary', action='store', type=str)
    argparser.add_argument('--clauses', '-c', action='store', type=str, default=None)
    argparser.add_argument('--triples', '-t', action='store', type=str, default=None)

    args = argparser.parse_args(argv)

    model_path = args.model
    adversary_path = args.adversary
    clauses_path = args.clauses
    triples_path = args.triples

    with open(model_path, 'rb') as f:
        model_data = pickle.load(f)
    with open(adversary_path, 'rb') as f:
        adversary_data = pickle.load(f)

    # Row 0 of each embedding matrix is reserved, hence the [1:, :] slices
    entity_embeddings = model_data['entities'][1:, :]
    predicate_embeddings = model_data['predicates'][1:, :]

    variables = adversary_data['variables']

    entity_to_index = model_data['entity_to_index']
    predicate_to_index = model_data['predicate_to_index']

    entity_indices = sorted(set(entity_to_index.values()))
    predicate_indices = sorted(set(predicate_to_index.values()))

    triples = None
    if triples_path is not None:
        triples, _ = read_triples(triples_path)
        triples_idx = [(entity_to_index[s], predicate_to_index[p], entity_to_index[o]) for (s, p, o) in triples]

    clauses = None
    clause_to_ground_mappings = None

    if clauses_path is not None:
        with open(clauses_path, 'r') as f:
            clauses = [parse_clause(line.strip()) for line in f.readlines()]

        clause_to_variable_names = {clause: GroundLoss.get_variable_names(clause) for clause in clauses}
        clause_to_mappings = {clause: GroundLoss.sample_mappings(clause_to_variable_names[clause], entity_indices)
                              for clause in clauses}

        if triples is not None:
            clause_to_ground_mappings = {
                clauses[0]: [{'X': s_idx, 'Y': o_idx} for (s_idx, _, o_idx) in triples_idx]
            }

    nb_entities = len(entity_to_index)

    # Append the adversarially learned "violating" embeddings to the entity matrix
    for variable_name, embedding in variables.items():
        variable_name = variable_name.split('_')[2]
        entity_to_index[variable_name] = len(entity_to_index) + 1
        entity_embeddings = np.concatenate((entity_embeddings, embedding), axis=0)

    index_to_entity = {index: entity for entity, index in entity_to_index.items()}
    index_to_predicate = {index: predicate for predicate, index in predicate_to_index.items()}

    # Project the embeddings to 2D for plotting
    projector = MDS(n_components=2, random_state=0)

    np.set_printoptions(suppress=True)

    entity_embeddings_proj = projector.fit_transform(entity_embeddings)

    plt.scatter(entity_embeddings_proj[:nb_entities, 0], entity_embeddings_proj[:nb_entities, 1], color='c')
    plt.scatter(entity_embeddings_proj[nb_entities:, 0], entity_embeddings_proj[nb_entities:, 1], color='r')

    # Finding the maximum violators
    if clauses is not None:
        import operator

        kwargs = {
            'entity_to_index': entity_to_index,
            'entity_embeddings': entity_embeddings,
            'predicate_to_index': predicate_to_index,
            'predicate_embeddings': predicate_embeddings,
            'scoring_function': score_TransE_L1
        }

        for clause, mappings in clause_to_mappings.items():
            mapping_loss_lst = [(mapping, loss_clause(clause, mapping, **kwargs)) for mapping in mappings]

            # Find the most violating variable assignment
            # (i.e. the variable assignment with the highest loss)
            most_violating_mapping = max(mapping_loss_lst, key=operator.itemgetter(1))[0]
            logger.info('Most violating mapping: {}'.format(most_violating_mapping))

            for variable_name, entity_idx in most_violating_mapping.items():
                plt.scatter(entity_embeddings_proj[entity_idx - 1, 0],
                            entity_embeddings_proj[entity_idx - 1, 1], color='b')

        if clause_to_ground_mappings is not None:
            for clause, mappings in clause_to_ground_mappings.items():
                mapping_loss_lst = [(mapping, loss_clause(clause, mapping, **kwargs)) for mapping in mappings]

                # Find the most violating variable assignment
                # (i.e. the variable assignment with the highest loss)
                most_violating_mapping = max(mapping_loss_lst, key=operator.itemgetter(1))[0]
                logger.info('Most violating ground mapping: {}'.format(most_violating_mapping))

                for variable_name, entity_idx in most_violating_mapping.items():
                    plt.scatter(entity_embeddings_proj[entity_idx - 1, 0],
                                entity_embeddings_proj[entity_idx - 1, 1], color='g')

    for index, (x, y) in enumerate(zip(entity_embeddings_proj[:, 0], entity_embeddings_proj[:, 1]), 1):
        label = index_to_entity[index]
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    plt.show()
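
# Example invocation (file names are illustrative): 'model' and 'adversary'
# are pickles produced earlier in the pipeline, holding the embedding
# matrices and the adversarial variables respectively; clauses and triples
# are optional overlays for highlighting the most violating assignments:
#
#   $ python plot_embeddings.py model.pkl adversary.pkl -c clauses.pl -t train.tsv
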
def test_losses():
    hyperparam_configurations = list(cartesian_product(hyperparams))

    for hyperparam_configuration in hyperparam_configurations:
        # Clauses
        clause = parse_clause(hyperparam_configuration['clause'])

        # Instantiating the model parameters
        model_class = models.get_function(hyperparam_configuration['model_name'])
        similarity_function = similarities.get_function('dot')

        unit_cube = hyperparam_configuration['unit_cube']

        for seed in range(4):
            print('Seed {}, Evaluating {}'.format(seed, str(hyperparam_configuration)))

            tf.reset_default_graph()

            np.random.seed(seed)
            tf.set_random_seed(seed)

            entity_embedding_size = np.random.randint(low=1, high=5) * 2
            predicate_embedding_size = entity_embedding_size

            # Instantiating entity and predicate embedding layers
            entity_embedding_layer = tf.get_variable('entities',
                                                     shape=[nb_entities + 1, entity_embedding_size],
                                                     initializer=tf.contrib.layers.xavier_initializer())
            predicate_embedding_layer = tf.get_variable('predicates',
                                                        shape=[nb_predicates + 1, predicate_embedding_size],
                                                        initializer=tf.contrib.layers.xavier_initializer())

            entity_projection = constraints.unit_sphere(entity_embedding_layer, norm=1.0)
            if unit_cube:
                entity_projection = constraints.unit_cube(entity_embedding_layer)

            entity_inputs = tf.placeholder(tf.int32, shape=[None, 2])
            walk_inputs = tf.placeholder(tf.int32, shape=[None, None])

            entity_embeddings = tf.nn.embedding_lookup(entity_embedding_layer, entity_inputs)
            predicate_embeddings = tf.nn.embedding_lookup(predicate_embedding_layer, walk_inputs)

            model_parameters = dict(entity_embeddings=entity_embeddings,
                                    predicate_embeddings=predicate_embeddings,
                                    similarity_function=similarity_function)

            model = model_class(**model_parameters)
            score = model()

            closed_form_lifted = ClosedForm(parser=parser,
                                            predicate_embedding_layer=predicate_embedding_layer,
                                            model_class=model_class, model_parameters=model_parameters,
                                            is_unit_cube=unit_cube)
            opt_adversarial_loss = closed_form_lifted(clause)

            v_optimizer = tf.train.AdagradOptimizer(learning_rate=1e-2)
            v_training_step = v_optimizer.minimize(opt_adversarial_loss, var_list=[predicate_embedding_layer])

            init_op = tf.global_variables_initializer()

            with tf.Session() as session:
                session.run(init_op)
                session.run([entity_projection])

                def scoring_function(args):
                    return session.run(score, feed_dict={walk_inputs: args[0], entity_inputs: args[1]})

                ground_loss = GroundLoss(clauses=[clause], parser=parser, scoring_function=scoring_function)

                feed_dict = {'X': a_idx, 'Y': b_idx}
                continuous_loss_0 = ground_loss.continuous_error(clause, feed_dict=feed_dict)

                # Minimising the closed-form (lifted) adversarial loss should not
                # increase the ground continuous loss of the clause
                for epoch in range(1, 100 + 1):
                    _ = session.run([v_training_step])
                    print(ground_loss.continuous_error(clause, feed_dict=feed_dict))

                continuous_loss_final = ground_loss.continuous_error(clause, feed_dict=feed_dict)
                assert continuous_loss_0 <= .0 or continuous_loss_final <= continuous_loss_0

    tf.reset_default_graph()
def main(argv):
    def formatter(prog):
        return argparse.HelpFormatter(prog, max_help_position=100, width=200)

    argparser = argparse.ArgumentParser('Populate a Knowledge Base', formatter_class=formatter)

    argparser.add_argument('triples', action='store', type=str, default=None)
    argparser.add_argument('clauses', action='store', type=str, default=None)
    argparser.add_argument('--output', '-o', action='store', type=str, default=None)

    args = argparser.parse_args(argv)

    triples_path = args.triples
    clauses_path = args.clauses
    output_path = args.output

    triples, _ = read_triples(triples_path)

    # Parse the clauses using Sebastian's parser
    with open(clauses_path, 'r') as f:
        clauses_str = [line.strip() for line in f.readlines()]
    clauses = [parse_clause(clause_str) for clause_str in clauses_str]

    # Create a set containing all predicate names
    predicate_names = {p for (_, p, _) in triples}
    for clause in clauses:
        predicate_names |= {clause.head.predicate.name}
        for atom in clause.body:
            predicate_names |= {atom.predicate.name}

    # The original predicate names might not be handled well by Pyke (as is the case with
    # e.g. Freebase identifiers), so replace them with p0, p1, p2, etc.
    predicate_to_idx = {predicate: 'p{}'.format(idx) for idx, predicate in enumerate(predicate_names)}
    idx_to_predicate = {idx: predicate for predicate, idx in predicate_to_idx.items()}

    # Generate a Pyke rule base for reasoning via forward chaining
    rule_str_lst = []
    for idx, clause in enumerate(clauses):
        head, body = clause.head, clause.body

        head_str = '\t\tfacts.{}(${}, ${})'.format(predicate_to_idx[head.predicate.name],
                                                   head.arguments[0].name, head.arguments[1].name)
        body_str = ''
        for atom in body:
            body_str += '\t\tfacts.{}(${}, ${})\n'.format(predicate_to_idx[atom.predicate.name],
                                                          atom.arguments[0].name, atom.arguments[1].name)

        rule_str_lst += ['rule_{}\n\tforeach\n{}\n\tassert\n{}\n'.format(idx, body_str, head_str)]

    # Write the Pyke rule base to file
    with open(RULES_KRB_PATH, 'w') as f:
        f.writelines('{}\n'.format(rule_str) for rule_str in rule_str_lst)

    engine = knowledge_engine.engine('.')

    # Assert the starting facts, corresponding to the triples already in the Knowledge Graph
    for (s, p, o) in tqdm(triples):
        engine.assert_('facts', predicate_to_idx[p], (s, o))

    engine.activate(os.path.splitext(os.path.basename(RULES_KRB_PATH))[0])

    # For each predicate p, query the reasoning engine ..
    materialized_triples = []
    for predicate_name in tqdm(predicate_names):
        # .. asking for all subject-object pairs (s, o) such that (s, p, o) is entailed by the knowledge base
        with engine.prove_goal('facts.{}($s, $o)'.format(predicate_to_idx[predicate_name])) as gen:
            for vs, plan in gen:
                materialized_triples += [(vs['s'], predicate_name, vs['o'])]

    if output_path is not None:
        # Write the materialized triples to file
        with open(output_path, 'w') as f:
            f.writelines('{}\t{}\t{}\n'.format(s, p, o) for s, p, o in materialized_triples)
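
# For illustration, assuming predicate_to_idx == {'q': 'p0', 'p': 'p1'}, the clause
# q(X, Y) :- p(X, Y) produces the following forward-chaining rule in RULES_KRB_PATH
# (whenever the body fact matches, the head fact is asserted):
#
#   rule_0
#       foreach
#           facts.p1($X, $Y)
#
#       assert
#           facts.p0($X, $Y)
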