'/m/028cl7': [5, 1, '/m/03lty', '/music/genre/subgenre', '/m/028cl7', 'tail'], '/m/03rg5x': [ 4, 1, '/m/0cbd2', '/people/profession/people_with_this_profession', '/m/03rg5x', 'tail' ] } #### LOAD DATASET print("Loading dataset...") complex_dataset = Dataset(name=dataset_name, separator="\t", load=True) ### LOAD TRAINED ORIGINAL MODEL print("Loading original trained model...") original_model = ComplEx(dataset=complex_dataset, dimension=dimension, init_random=True, init_size=init) # type: ComplEx original_model.load_state_dict(torch.load(model_path)) original_model.to('cuda') ### FOR EACH ENTITY, PERFORM A KELPIE ANALYSIS for entity_to_explain in entity_2_params: train_degree, test_degree, head, relation, tail, perspective = entity_2_params[ entity_to_explain] print( "\nWorking with entity %s (train degree %i; test degree %i): explaining fact <%s, %s, %s>." % (entity_to_explain, train_degree, test_degree, head, relation, tail)) # get the ids of the elements of the fact to explain and the perspective entity head_id, relation_id, tail_id = complex_dataset.get_id_for_entity_name(head), \ complex_dataset.get_id_for_relation_name(relation), \
'/m/01tnbn': [72, 7, '/m/0cbd2', '/people/profession/people_with_this_profession', '/m/01tnbn', 'tail'], '/m/028d4v': [69, 11, '/m/028d4v', '/people/person/profession', '/m/0dxtg', 'head'], '/m/03193l': [43, 5, '/m/03193l', '/common/topic/webpage./common/webpage/category', '/m/08mbj32', 'head'], '/m/0g2ff': [29, 4, '/m/0g2ff', '/music/performance_role/regular_performances./music/group_membership/role', '/m/0mkg', 'head'], '/m/0269kx': [21, 1, '/m/0h52w', '/protected_sites/natural_or_cultural_site_designation/sites./protected_sites/natural_or_cultural_site_listing/listed_site', '/m/0269kx', 'tail'], '/m/028cl7': [5, 1, '/m/03lty', '/music/genre/subgenre', '/m/028cl7', 'tail'], '/m/03rg5x': [4, 1, '/m/0cbd2', '/people/profession/people_with_this_profession', '/m/03rg5x', 'tail'] } #### LOAD DATASET print("Loading dataset...") complex_dataset = Dataset(name=dataset_name, separator="\t", load=True) ### LOAD TRAINED ORIGINAL MODEL print("Loading original trained model...") original_model = ComplEx(dataset=complex_dataset, dimension=dimension, init_random=True, init_size=init) # type: ComplEx original_model.load_state_dict(torch.load(model_path)) original_model.to('cuda') ### FOR EACH ENTITY, PERFORM A KELPIE ANALYSIS for entity_to_explain in entity_2_params: train_degree, test_degree, head, relation, tail, perspective = entity_2_params[entity_to_explain] print("\nWorking with entity %s (train degree %i; test degree %i): explaining fact <%s, %s, %s>." % (entity_to_explain, train_degree, test_degree, head, relation, tail)) # get the ids of the elements of the fact to explain and the perspective entity head_id, relation_id, tail_id = complex_dataset.get_id_for_entity_name(head), \ complex_dataset.get_id_for_relation_name(relation), \ complex_dataset.get_id_for_entity_name(tail) original_entity_id = head_id if perspective == 'head' else tail_id
original_dataset.get_id_for_entity_name(tail) original_entity_id = head_id if args.perspective == "head" else tail_id # create the fact to explain as a numpy array of its ids original_triple = (head_id, relation_id, tail_id) original_sample = numpy.array(original_triple) # check that the fact to explain is actually a test fact assert(original_sample in original_dataset.test_samples) ############# INITIALIZE MODELS AND THEIR STRUCTURES print("Loading model at location %s..." % args.model_path) # instantiate and load the original model from filesystem original_model = ComplEx(dataset=original_dataset, dimension=args.dimension, init_random=True, init_size=args.init) # type: ComplEx original_model.load_state_dict(torch.load(args.model_path)) original_model.to('cuda') kelpie_dataset = KelpieDataset(dataset=original_dataset, entity_id=original_entity_id) ############ EXTRACT TEST FACTS AND TRAINING FACTS print("Extracting train and test samples for the original and the kelpie entities...") # extract all training facts and test facts involving the entity to explain # and replace the id of the entity to explain with the id of the fake kelpie entity original_test_samples = kelpie_dataset.original_test_samples kelpie_test_samples = kelpie_dataset.kelpie_test_samples kelpie_train_samples = kelpie_dataset.kelpie_train_samples
def rbo(original_model: ComplEx,
        kelpie_model: KelpieComplEx,
        original_samples: numpy.array,
        kelpie_samples: numpy.array):
    """
    Compare how the original model and the Kelpie model rank the same facts.

    Both models are asked to predict their own version of the samples
    (``original_samples[i]`` and ``kelpie_samples[i]`` are treated as the same
    fact seen by the two models). For every sample and for both the head and
    the tail prediction, the id of the target entity is overwritten with the
    shared placeholder ``-1`` and each ranking is truncated right after its
    own target; the two truncated rankings are then compared with
    ``ranking_similarity.rank_biased_overlap``.

    NOTE: the prediction lists returned by ``predict_samples`` are modified
    in place (the target id is replaced by ``-1``).

    :param original_model: model scoring ``original_samples``
    :param kelpie_model: model scoring ``kelpie_samples``; it is queried with ``original_mode=False``
    :param original_samples: (head, relation, tail) id triples for the original model
    :param kelpie_samples: (head, relation, tail) id triples for the Kelpie model, index-aligned with ``original_samples``
    :return: the tuple (average RBO over all head and tail rankings,
             MRR of the original model, MRR of the Kelpie model,
             H@1 of the original model, H@1 of the Kelpie model)
    """

    def _mask_and_cut(predictions, target_index):
        # give the target the same id (-1) in both rankings, then keep the ranking up to and including the target
        predictions[target_index] = -1
        return predictions[:target_index + 1]

    _, original_ranks, original_predictions = original_model.predict_samples(original_samples)
    _, kelpie_ranks, kelpie_predictions = kelpie_model.predict_samples(samples=kelpie_samples,
                                                                       original_mode=False)

    # flatten the (head rank, tail rank) pairs into a single list of ranks for each model
    flat_original_ranks = [rank
                           for (head_rank, tail_rank) in original_ranks
                           for rank in (head_rank, tail_rank)]
    flat_kelpie_ranks = [rank
                         for (head_rank, tail_rank) in kelpie_ranks
                         for rank in (head_rank, tail_rank)]

    original_mrr = mrr(flat_original_ranks)
    kelpie_mrr = mrr(flat_kelpie_ranks)
    original_h1 = hits_k(flat_original_ranks, 1)
    kelpie_h1 = hits_k(flat_kelpie_ranks, 1)

    overlaps = []
    for position, cur_original_sample in enumerate(original_samples):
        cur_kelpie_sample = kelpie_samples[position]

        original_head, _, original_tail = cur_original_sample
        kelpie_head, _, kelpie_tail = cur_kelpie_sample

        # ranks are 1-based, positions in the prediction lists are 0-based
        original_head_pos = int(original_ranks[position][0] - 1)
        original_tail_pos = int(original_ranks[position][1] - 1)
        kelpie_head_pos = int(kelpie_ranks[position][0] - 1)
        kelpie_tail_pos = int(kelpie_ranks[position][1] - 1)

        # each prediction entry holds the head ranking first and the tail ranking second
        original_head_ranking, original_tail_ranking = original_predictions[position][0], \
                                                       original_predictions[position][1]
        kelpie_head_ranking, kelpie_tail_ranking = kelpie_predictions[position][0], \
                                                   kelpie_predictions[position][1]

        # sanity check: the entity found at the target rank must be the target itself
        assert original_head_ranking[original_head_pos] == original_head
        assert kelpie_head_ranking[kelpie_head_pos] == kelpie_head
        assert original_tail_ranking[original_tail_pos] == original_tail
        assert kelpie_tail_ranking[kelpie_tail_pos] == kelpie_tail

        # mask the targets and truncate every ranking before any comparison is computed
        original_head_ranking = _mask_and_cut(original_head_ranking, original_head_pos)
        kelpie_head_ranking = _mask_and_cut(kelpie_head_ranking, kelpie_head_pos)
        original_tail_ranking = _mask_and_cut(original_tail_ranking, original_tail_pos)
        kelpie_tail_ranking = _mask_and_cut(kelpie_tail_ranking, kelpie_tail_pos)

        overlaps.append(ranking_similarity.rank_biased_overlap(original_head_ranking,
                                                               kelpie_head_ranking))
        overlaps.append(ranking_similarity.rank_biased_overlap(original_tail_ranking,
                                                               kelpie_tail_ranking))

    avg_rbo = float(sum(overlaps)) / float(len(overlaps))
    return avg_rbo, original_mrr, kelpie_mrr, original_h1, kelpie_h1
torch.backends.cudnn.deterministic = True

# Decide which file holds the model weights: an explicitly passed args.load path wins,
# otherwise fall back to MODEL_PATH/ComplEx_<dataset>.pt
if args.load is not None:
    model_path = args.load
else:
    model_path = os.path.join(MODEL_PATH, "_".join(["ComplEx", args.dataset]) + ".pt")
    # exist_ok=True replaces the isdir() check followed by makedirs():
    # same result, without the window in which another process could create the folder in between
    os.makedirs(MODEL_PATH, exist_ok=True)

print("Loading %s dataset..." % args.dataset)
dataset = Dataset(name=args.dataset, separator="\t", load=True)

print("Initializing model...")
model = ComplEx(dataset=dataset,
                dimension=args.dimension,
                init_random=True,
                init_size=args.init)   # type: ComplEx
model.to('cuda')

# when a path was passed in args.load, overwrite the freshly initialized weights with the stored ones
if args.load is not None:
    model.load_state_dict(torch.load(model_path))

print("Training model...")
optimizer = MultiClassNLLptimizer(model=model,
                                  optimizer_name=args.optimizer,
                                  batch_size=args.batch_size,
                                  learning_rate=args.learning_rate,
                                  decay1=args.decay1,
                                  decay2=args.decay2,
                                  regularizer_name='N3',
                                  regularizer_weight=args.reg)
def compute_fact_relevance(model: "ComplEx",
                           dataset: "Dataset",
                           sample_to_explain,
                           perspective="head",
                           perturbation_step=0.05,
                           lambd=1):
    """
    Rank the training facts of an entity by their relevance to a fact to explain.

    The embedding of the entity to explain is moved one step against the
    gradient of the score of ``sample_to_explain``; every training sample that
    mentions the entity is then re-scored with the perturbed embedding, and its
    relevance is ``original_score - lambd * perturbed_score``.

    :param model: model exposing ``entity_embeddings``, ``relation_embeddings``, ``dimension``,
                  ``score(samples)`` and ``score_embeddings(head, relation, tail)``
    :param dataset: dataset whose ``train_samples`` are iterated as (head, relation, tail) id triples
    :param sample_to_explain: the (head id, relation id, tail id) triple to explain
    :param perspective: "head" to explain the head entity of the fact; any other value explains the tail
    :param perturbation_step: multiplier of the gradient subtracted from the embedding
    :param lambd: weight of the perturbed score in the relevance formula
    :return: list of ``(sample_tuple, relevance)`` pairs sorted by decreasing relevance;
             an empty list if no training sample mentions the entity to explain
    """
    head_id, relation_id, tail_id = sample_to_explain
    entity_to_explain_id = head_id if perspective == "head" else tail_id

    # extract all training samples containing the entity to explain
    samples_containing_entity_to_explain = numpy.array([(h, r, t) for (h, r, t) in dataset.train_samples
                                                        if entity_to_explain_id in [h, t]])

    # without any training sample there is nothing to rank
    # (and the empty array could not be sliced by column below)
    if samples_containing_entity_to_explain.shape[0] == 0:
        return []

    # get the embedding of the head entity, of the relation, and of the tail entity of the fact to explain
    embedding_shape = (1, model.dimension * 2)
    head_embedding = model.entity_embeddings[head_id].detach().reshape(*embedding_shape)
    relation_embedding = model.relation_embeddings[relation_id].detach().reshape(*embedding_shape)
    tail_embedding = model.entity_embeddings[tail_id].detach().reshape(*embedding_shape)

    # set the requires_grad flag of the embedding of the entity to explain to true,
    # so that backward() stores the gradient of the score with respect to it
    entity_to_explain_embedding = head_embedding if perspective == "head" else tail_embedding
    entity_to_explain_embedding.requires_grad = True

    # compute the score of the fact, and extract the gradient of the embedding of the entity to explain;
    # then, perturb the embedding of the entity to explain by stepping against that gradient
    score = model.score_embeddings(head_embedding, relation_embedding, tail_embedding)
    score.backward()
    gradient = entity_to_explain_embedding.grad[0]
    perturbed_entity_to_explain_embedding = \
        entity_to_explain_embedding.detach() - perturbation_step * gradient.detach()

    # scores of the training samples with the original embeddings
    original_scores = model.score(samples_containing_entity_to_explain)

    # extract the embeddings for the head entities, relations, and tail entities
    # of all training samples containing the entity to explain
    head_embeddings = model.entity_embeddings[samples_containing_entity_to_explain[:, 0]]
    relation_embeddings = model.relation_embeddings[samples_containing_entity_to_explain[:, 1]]
    tail_embeddings = model.entity_embeddings[samples_containing_entity_to_explain[:, 2]]

    # for the entity to explain, use the perturbed embedding rather than the original one.
    # The two checks are independent on purpose: in a self-loop fact <e, r, e>
    # the entity occupies both positions, and both must be perturbed.
    for i, (h, _, t) in enumerate(samples_containing_entity_to_explain):
        if h == entity_to_explain_id:
            head_embeddings[i] = perturbed_entity_to_explain_embedding
        if t == entity_to_explain_id:
            tail_embeddings[i] = perturbed_entity_to_explain_embedding

    # compute the scores of all training samples containing the entity to explain
    # using its perturbed embedding rather than the original one
    perturbed_scores = model.score_embeddings(head_embeddings,
                                              relation_embeddings,
                                              tail_embeddings).detach().cpu().numpy()

    # the relevance of each training sample is original_score - lambd * perturbed_score
    sample_2_relevance = {
        tuple(sample): (original_scores[i] - lambd * perturbed_scores[i])[0]
        for i, sample in enumerate(samples_containing_entity_to_explain)
    }

    return sorted(sample_2_relevance.items(), key=lambda x: x[1], reverse=True)
original_dataset.get_id_for_relation_name(relation), \ original_dataset.get_id_for_entity_name(tail) original_entity_id = head_id if args.perspective == "head" else tail_id # create the fact to explain as a numpy array of its ids original_sample_tuple = (head_id, relation_id, tail_id) original_sample = numpy.array(original_sample_tuple) # check that the fact to explain is actually a test fact assert(original_sample in original_dataset.test_samples) ############# INITIALIZE MODELS AND THEIR STRUCTURES print("Loading model at location %s..." % args.model_path) # instantiate and load the original model from filesystem original_model = ComplEx(dataset=original_dataset, dimension=args.dimension, init_random=True, init_size=args.init) # type: ComplEx original_model.load_state_dict(torch.load(args.model_path)) original_model.to('cuda') print("Wrapping the original model in a Kelpie explainable model...") # use model_to_explain to initialize the Kelpie model kelpie_dataset = KelpieDataset(dataset=original_dataset, entity_id=original_entity_id) kelpie_model = KelpieComplEx(model=original_model, dataset=kelpie_dataset, init_size=1e-3) # type: KelpieComplEx kelpie_model.to('cuda') ############ EXTRACT TEST FACTS AND TRAINING FACTS print("Extracting train and test samples for the original and the kelpie entities...") # extract all training facts and test facts involving the entity to explain # and replace the id of the entity to explain with the id of the fake kelpie entity