def _test_er_graph_export(self, dataset):
    #Build the ER model directly and compare it against the exported graph.
    model = dataset()
    entity, relation, triples, entity_pairs, true_pairs = model.get_er_model()

    graph = Graph_ER(dataset, rebuild=True)
    e, r, t, ep, tp = graph.load_kg_er_model()

    self.assertEqual(len(e), len(entity))
    self.assertEqual(len(r), len(relation))
    self.assertEqual(len(t), len(triples))
    self.assertEqual(len(ep), len(entity_pairs))
    self.assertEqual(len(tp), len(true_pairs))
def _test_transh(self, dataset, params):
    graph = Graph_ER(dataset)
    model = dataset()
    logger = get_logger('RL.Test.er.TransH.' + str(model))

    transh = TransH(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    #Rank each candidate pair by the cosine distance of its entity embeddings
    ent_embeddings = transh.get_ent_embeddings()
    result_prob = []
    for i in range(len(graph.entity_pairs)):
        distance = abs(
            spatial.distance.cosine(
                ent_embeddings[graph.entity_pairs[i][0]],
                ent_embeddings[graph.entity_pairs[i][1]]))
        result_prob.append(
            (graph.entity_pairs[i][0], graph.entity_pairs[i][1], distance))
        #logger.info("i: %d, distance: %f true_pairs: %s", i, distance,
        #            graph.entity_pairs[i] in true_pairs)

    #Write Embeddings to file
    export_embeddings('er', str(model), 'TransH', graph.entity,
                      ent_embeddings)
    export_result_prob(dataset, 'er', str(model), 'TransH', graph.entity,
                       result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)

    try:
        logger.info("MAX FSCORE: %f AT : %f", max_fscore, optimal_threshold)
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        params['threshold'] = optimal_threshold
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
        export_false_negatives(dataset, 'er', str(model), 'TransH',
                               graph.entity, result_prob, graph.true_pairs,
                               result, graph.entity)
        export_false_positives(dataset, 'er', str(model), 'TransH',
                               graph.entity, result_prob, graph.true_pairs,
                               result, graph.entity)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    p_at_1 = ir_metrics.log_metrics(logger, params)

    transh.close_tf_session()
    return (max_fscore, p_at_1)
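#Illustrative sketch (not used by the tests) of the scoring rule in
#_test_transh above: candidate pairs are ranked by the cosine distance
#between their entity embeddings, so near-duplicates score close to 0 and
#unrelated records close to 1. The toy embeddings below are made up.
def _cosine_ranking_sketch():
    import numpy as np
    from scipy import spatial

    emb = np.array([[1.0, 0.0],   # record A
                    [0.9, 0.1],   # near-duplicate of A
                    [0.0, 1.0]])  # unrelated record
    d_match = abs(spatial.distance.cosine(emb[0], emb[1]))
    d_nonmatch = abs(spatial.distance.cosine(emb[0], emb[2]))
    assert d_match < d_nonmatch
    return d_match, d_nonmatch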
def _test_rl_transe(self, dataset, params):
    #Load Graph Data
    graph = Graph_ER(dataset)
    model = dataset()
    logger = get_logger('RL.Test.er.RLTransE.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    #Score each pair: entity distance plus the distance between attribute
    #values that both entities reach via the same relation.
    ent_embeddings = transe.get_ent_embeddings()
    result_prob = []
    for (a, b) in graph.entity_pairs:
        a_triples = [(h, t, r) for (h, t, r) in graph.triples if h == a]
        b_triples = [(h, t, r) for (h, t, r) in graph.triples if h == b]

        distance = abs(
            spatial.distance.cosine(ent_embeddings[a], ent_embeddings[b]))
        for (ah, at, ar) in a_triples:
            bt = [t for (h, t, r) in b_triples if r == ar]
            if bt:
                distance = distance + abs(
                    spatial.distance.cosine(ent_embeddings[at],
                                            ent_embeddings[bt[0]]))
        result_prob.append((a, b, distance))
        #logger.info("a: %d, b: %d distance: %f true_pairs: %s", a, b,
        #            distance, (a, b) in graph.true_pairs)

    #Write Embeddings to file
    export_embeddings('er', str(model), 'RLTransE', graph.entity,
                      ent_embeddings)
    export_result_prob(dataset, 'er', str(model), 'RLTransE', graph.entity,
                       result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs, max_threshold=3.0)

    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    precision_at_1 = ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return (max_fscore, precision_at_1)
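#Illustrative sketch (not used by the tests) of the RLTransE pair distance
#computed above: start from the cosine distance of the two entity embeddings,
#then add the distance between attribute values the entities reach via the
#same relation. Triples follow the (head, tail, relation) index convention of
#graph.triples; all values here are made up.
def _rltranse_distance_sketch():
    import numpy as np
    from scipy import spatial

    emb = np.random.RandomState(0).rand(4, 8)  # 4 toy embeddings
    triples = [(0, 2, 0), (1, 3, 0)]  # entities 0 and 1 share relation 0

    a, b = 0, 1
    distance = abs(spatial.distance.cosine(emb[a], emb[b]))
    a_triples = [(h, t, r) for (h, t, r) in triples if h == a]
    b_triples = [(h, t, r) for (h, t, r) in triples if h == b]
    for (ah, at, ar) in a_triples:
        bt = [t for (h, t, r) in b_triples if r == ar]
        if bt:
            # both entities have a value for relation ar: compare the values
            distance += abs(spatial.distance.cosine(emb[at], emb[bt[0]]))
    return distance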
def test_cora(self, params=None):
    if not params:
        params = self.get_default_params()

    #Load Graph Data
    graph = Graph_ER(Cora)
    model = Cora()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddings = transe.get_ent_embeddings()

    #Experimenting with household-style matching: align the author sets of
    #the two papers at minimum total cost, then add that cost to the distance.
    auth_rel_index = graph.relation.index('author')
    result_prob = []
    for ep_index in range(len(graph.entity_pairs)):
        authors_A = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][0] and r == auth_rel_index
        ]
        #logger.info("AUTHORS A: %s", str([graph.entity[a] for a in authors_A]))
        authors_B = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][1] and r == auth_rel_index
        ]
        #logger.info("AUTHORS B: %s", str([graph.entity[a] for a in authors_B]))

        cost_matrix = np.zeros(shape=(len(authors_A), len(authors_B)))
        for i in range(len(authors_A)):
            for j in range(len(authors_B)):
                #if authors_A[i] == authors_B[j]:
                #    cost_matrix[i][j] = 100
                #else:
                cost_matrix[i][j] = abs(
                    spatial.distance.cosine(ent_embeddings[authors_A[i]],
                                            ent_embeddings[authors_B[j]]))

        #logger.info("Cost Matrix: %s", str(cost_matrix))
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        #logger.info("Cost of aligning = %f", cost_matrix[row_ind, col_ind].sum())
        distance = cost_matrix[row_ind, col_ind].sum() + abs(
            spatial.distance.cosine(
                ent_embeddings[graph.entity_pairs[ep_index][0]],
                ent_embeddings[graph.entity_pairs[ep_index][1]]))
        result_prob.append((graph.entity_pairs[ep_index][0],
                            graph.entity_pairs[ep_index][1], distance))
        if distance <= 0.05:
            logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                        distance,
                        graph.entity_pairs[ep_index] in graph.true_pairs)

    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(Cora, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)

    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return max_fscore
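#Illustrative sketch (not used by the tests) of the Hungarian-algorithm step
#in test_cora above: scipy's linear_sum_assignment selects the one-to-one
#author alignment with minimum total cosine cost. The cost matrix is made up.
def _author_alignment_sketch():
    import numpy as np
    from scipy.optimize import linear_sum_assignment

    # rows: authors of paper A, cols: authors of paper B
    cost_matrix = np.array([[0.1, 0.9],
                            [0.8, 0.2]])
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    # optimal alignment pairs author 0 with 0 and author 1 with 1
    assert np.isclose(cost_matrix[row_ind, col_ind].sum(), 0.3)
    return row_ind, col_ind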
def test_febrl(self, params=None):
    if not params:
        params = self.get_default_params()

    #Load Graph Data
    graph = Graph_ER(FEBRL)
    model = FEBRL()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddings = transe.get_ent_embeddings()

    #Experimenting with household matching: align the postcode neighbourhoods
    #of the two persons at minimum total cost.
    postcode_rel_id = graph.relation.index("postcode")
    result_prob = []
    for ep_index in range(len(graph.entity_pairs)):
        person_A = graph.entity_pairs[ep_index][0]
        person_B = graph.entity_pairs[ep_index][1]
        postcode_A = [
            t for (h, t, r) in graph.triples
            if h == person_A and r == postcode_rel_id
        ][0]
        neighbours_A = [h for (h, t, r) in graph.triples if t == postcode_A]
        #logger.info("FM A: %s", str([graph.entity[a] for a in neighbours_A]))
        postcode_B = [
            t for (h, t, r) in graph.triples
            if h == person_B and r == postcode_rel_id
        ][0]
        neighbours_B = [h for (h, t, r) in graph.triples if t == postcode_B]
        #logger.info("FM B: %s", str([graph.entity[a] for a in neighbours_B]))

        cost_matrix = np.zeros(shape=(len(neighbours_A), len(neighbours_B)))
        for i in range(len(neighbours_A)):
            for j in range(len(neighbours_B)):
                if neighbours_A[i] == neighbours_B[j]:
                    cost_matrix[i][j] = 100
                else:
                    cost_matrix[i][j] = abs(
                        spatial.distance.cosine(
                            ent_embeddings[neighbours_A[i]],
                            ent_embeddings[neighbours_B[j]]))

        #logger.info("Cost Matrix: %s", str(cost_matrix))
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        #logger.info("Cost of aligning = %f", cost_matrix[row_ind, col_ind].sum())
        person_A_index = neighbours_A.index(person_A)
        person_B_index = neighbours_B.index(person_B)
        distance = cost_matrix[row_ind, col_ind].sum() + \
            cost_matrix[person_A_index][person_B_index]
        #import ipdb; ipdb.set_trace()
        #if (person_A_index, person_B_index) not in (row_ind, col_ind):
        #    distance = distance + 1000
        result_prob.append((person_A, person_B, distance))

    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(FEBRL, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)

    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return max_fscore
def test_census(self, params=None):
    if not params:
        params = self.get_default_params()

    #Load Graph Data
    graph = Graph_ER(Census)
    model = Census()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddings = transe.get_ent_embeddings()

    #Experimenting with household matching: align the two family groups at
    #minimum total cost before scoring the person pair.
    result_prob = []
    for ep_index in range(len(graph.entity_pairs)):
        #logger.info("Computing cost for: %s",
        #            str([graph.entity[e] for e in graph.entity_pairs[ep_index]]))
        #Relations with index > 6 link a person to its household entity.
        household_A = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][0] and r > 6
        ][0]
        family_members_A = [
            h for (h, t, r) in graph.triples if t == household_A
        ]
        #logger.info("FM A: %s", str([graph.entity[a] for a in family_members_A]))
        household_B = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][1] and r > 6
        ][0]
        family_members_B = [
            h for (h, t, r) in graph.triples if t == household_B
        ]
        #logger.info("FM B: %s", str([graph.entity[a] for a in family_members_B]))

        cost_matrix = np.zeros(shape=(len(family_members_A),
                                      len(family_members_B)))
        for i in range(len(family_members_A)):
            for j in range(len(family_members_B)):
                #if family_members_A[i] == family_members_B[j]:
                #    cost_matrix[i][j] = 100
                #else:
                cost_matrix[i][j] = abs(
                    spatial.distance.cosine(
                        ent_embeddings[family_members_A[i]],
                        ent_embeddings[family_members_B[j]]))

        #logger.info("Cost Matrix: %s", str(cost_matrix))
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        #logger.info("Cost of aligning = %f", cost_matrix[row_ind, col_ind].sum())
        #logger.info("Rows selected %s, Col selected: %s", str(row_ind), str(col_ind))
        eA_index = family_members_A.index(graph.entity_pairs[ep_index][0])
        eB_index = family_members_B.index(graph.entity_pairs[ep_index][1])
        #logger.info("A index: %d, B index: %d", eA_index, eB_index)
        rowA = np.where(row_ind == eA_index)[0]
        if len(rowA) and col_ind[rowA[0]] == eB_index:
            #logger.info("Pair in min. cost matrix")
            distance = cost_matrix[row_ind, col_ind].sum()
        else:
            distance = cost_matrix[row_ind, col_ind].sum() + abs(
                spatial.distance.cosine(
                    ent_embeddings[graph.entity_pairs[ep_index][0]],
                    ent_embeddings[graph.entity_pairs[ep_index][1]]))
        result_prob.append((graph.entity_pairs[ep_index][0],
                            graph.entity_pairs[ep_index][1], distance))
        if ep_index % 1000 == 0:
            logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                        distance,
                        graph.entity_pairs[ep_index] in graph.true_pairs)
        #if graph.entity_pairs[ep_index] in graph.true_pairs:
        #    import ipdb; ipdb.set_trace()

    #Normalize distances with a fixed cap (dynamic max below is disabled)
    max_distance = 10
    #for r in result_prob:
    #    if r[2] > max_distance:
    #        max_distance = r[2]
    result_prob = [(r[0], r[1], (r[2] / max_distance)) for r in result_prob]
    #logger.info("Max distance: %f", max_distance)
    for r in result_prob[:100]:
        logger.info("distance: %f true_pairs: %s", r[2],
                    (r[0], r[1]) in graph.true_pairs)

    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(Census, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)

    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return max_fscore
def _test_werl(self, model, columns, params):
    #Load Graph Data
    dataset = model()
    logger = get_logger('RL.Test.WERL.' + str(dataset))
    ea_params = self.get_optimal_ea_params(model, params['ea_method'])

    if params['ea_method'] in [TransE, TransH]:
        #ER methods
        graph = Graph_ER(model)

        #Train TransE/TransH embedding vectors
        transe = params['ea_method'](
            graph,
            dimension=ea_params['dimension'],
            learning_rate=ea_params['learning_rate'],
            margin=ea_params['margin'],
            regularizer_scale=ea_params['regularizer_scale'],
            batchSize=ea_params['batchSize'],
            neg_rate=ea_params['neg_rate'],
            neg_rel_rate=ea_params['neg_rel_rate'])
        try:
            #raise Exception("Reset")
            transe.restore_model(
                self._get_tf_model_filename(dataset, transe))
        except Exception as e:
            #No saved model: train from scratch and save it
            logger.error(e)
            loss = transe.train(max_epochs=ea_params['epochs'])
            logger.info("Training Complete with loss: %f", loss)
            transe.save_model(self._get_tf_model_filename(dataset, transe))

        ent_embeddings = transe.get_ent_embeddings()
        rel_embeddings = None
        entity = graph.entity
        transe.close_tf_session()
    elif params['ea_method'] in [RLTransE]:
        #VEG methods
        graph = Graph_VEG(model)

        #Train RLTransE embedding vectors
        rltranse = params['ea_method'](
            graph,
            dimension=ea_params['dimension'],
            learning_rate=ea_params['learning_rate'],
            margin=ea_params['margin'],
            regularizer_scale=ea_params['regularizer_scale'],
            batchSize=ea_params['batchSize'],
            neg_rate=ea_params['neg_rate'],
            neg_rel_rate=ea_params['neg_rel_rate'])
        try:
            #raise Exception("Reset")
            rltranse.restore_model(
                self._get_tf_model_filename(dataset, rltranse))
        except Exception as e:
            logger.error(e)
            loss, val_loss = rltranse.train(max_epochs=ea_params['epochs'])
            logger.info("Training Complete with loss: %f", loss)
            rltranse.save_model(
                self._get_tf_model_filename(dataset, rltranse))

        val_embeddings = rltranse.get_val_embeddings()
        rel_embeddings = rltranse.get_rel_embeddings()
        if model == Census:
            #hack: the Census VEG graph has 8 relations but we need only 6:
            #drop the same_as and surname2 embeddings.
            rel_embeddings = np.append(rel_embeddings[1:3],
                                       rel_embeddings[4:],
                                       axis=0)

        ent_embeddings = []
        entity = []
        for rel in val_embeddings:
            val_count = len(graph.relation_value_map[rel])
            entity.extend(graph.relation_value_map[rel])
            ent_embeddings.extend(val_embeddings[rel][:val_count])
        assert len(ent_embeddings) == len(entity)
        rltranse.close_tf_session()
    elif params['ea_method'] in [VEER]:
        veer = VEER(model,
                    columns,
                    dimension=ea_params['dimension'],
                    learning_rate=ea_params['learning_rate'],
                    margin=ea_params['margin'],
                    regularizer_scale=ea_params['regularizer_scale'],
                    batchSize=ea_params['batchSize'])
        try:
            veer.restore_model(self._get_tf_model_filename(dataset, veer))
        except Exception as e:
            logger.error(e)
            #Train Model
            loss, val_loss = veer.train(max_epochs=ea_params['epochs'])
            logger.info("Training Complete with loss: %f, val_loss:%f",
                        loss, val_loss)
            veer.save_model(self._get_tf_model_filename(dataset, veer))

        ent_embeddings = veer.get_val_embeddings()
        rel_embeddings = None
        entity = veer.get_values()
        veer.close_tf_session()
    else:
        raise Exception("Unknown Entity Alignment method")

    #Train WERL weights
    werl = WERL(model,
                columns,
                entity,
                ent_embeddings,
                rel_embeddings,
                learning_rate=params['learning_rate'],
                margin=params['margin'],
                regularizer_scale=params['regularizer_scale'],
                batchSize=params['batchSize'])
    loss, val_loss = werl.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                val_loss)

    #Test WERL Model
    result_prob, accuracy = werl.test()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(werl.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))

    #Compute Performance measures
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Test MERL Model
    logger = get_logger('RL.Test.MERL.' + str(dataset))
    result_prob, accuracy = werl.test_merl()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(werl.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))

    #Compute Performance measures
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    #ir_metrics = InformationRetrievalMetrics(result_prob, dataset.true_test_links)
    precision_at_1 = None
    #ir_metrics.log_metrics(logger, params)

    #Test Without Weights = Mean Embedding for Record Linkage
    logger = get_logger('RL.Test.NoWT.' + str(dataset))
    result_prob, accuracy = werl.test_without_weight()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(werl.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))

    #Compute Performance measures
    optimal_threshold, nowt_max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    #ir_metrics = InformationRetrievalMetrics(result_prob, dataset.true_test_links)
    #nowt_precision_at_1 = ir_metrics.log_metrics(logger, params)

    werl.close_tf_session()
    return (max_fscore, precision_at_1)
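#Illustrative sketch (not used by the tests) of the unweighted baseline that
#test_without_weight evaluates above. The "NoWT" comment describes it as mean
#embedding for record linkage, so this sketch assumes each record is reduced
#to the plain mean of its field embeddings and pairs are compared by cosine
#distance; the embeddings below are made up.
def _mean_embedding_sketch():
    import numpy as np
    from scipy import spatial

    record_a = np.array([[0.9, 0.1], [0.8, 0.2]])  # one embedding per field
    record_b = np.array([[1.0, 0.0], [0.7, 0.3]])
    return abs(spatial.distance.cosine(record_a.mean(axis=0),
                                       record_b.mean(axis=0)))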