def _test_transh(self, dataset, params): graph = Graph_ER(dataset) model = dataset() logger = get_logger('RL.Test.er.TransH.' + str(model)) transh = TransH(graph, dimension=params['dimension'], learning_rate=params['learning_rate'], margin=params['margin'], regularizer_scale=params['regularizer_scale'], batchSize=params['batchSize'], neg_rate=params['neg_rate'], neg_rel_rate=params['neg_rel_rate']) loss = transh.train(max_epochs=params['epochs']) logger.info("Training Complete with loss: %f", loss) ent_embeddings = transh.get_ent_embeddings() result_prob = [] for i in range(0, len(graph.entity_pairs)): distance = abs( spatial.distance.cosine( ent_embeddings[graph.entity_pairs[i][0]], ent_embeddings[graph.entity_pairs[i][1]])) result_prob.append( (graph.entity_pairs[i][0], graph.entity_pairs[i][1], distance)) #logger.info("i: %d, distance: %f true_pairs: %s", i, distance, graph.entity_pairs[i] in true_pairs) #Write Embeddings to file export_embeddings('er', str(model), 'TransH', graph.entity, ent_embeddings) export_result_prob(dataset, 'er', str(model), 'TransH', graph.entity, result_prob, graph.true_pairs) optimal_threshold, max_fscore = get_optimal_threshold( result_prob, graph.true_pairs) try: logger.info("MAX FSCORE: %f AT : %f", max_fscore, optimal_threshold) result = pd.MultiIndex.from_tuples([(e1, e2) for (e1, e2, d) in result_prob if d <= optimal_threshold]) params['threshold'] = optimal_threshold log_quality_results(logger, result, graph.true_pairs, len(graph.entity_pairs), params) export_false_negatives(dataset, 'er', str(model), 'TransH', graph.entity, result_prob, graph.true_pairs, result, graph.entity) export_false_positives(dataset, 'er', str(model), 'TransH', graph.entity, result_prob, graph.true_pairs, result, graph.entity) except: logger.info("Zero Reults") #Log MAP, MRR and Hits@K ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs) p_at_1 = ir_metrics.log_metrics(logger, params) transh.close_tf_session() return (max_fscore, p_at_1)
def _test_seea(self, dataset, params): model = dataset() graph = Graph_EAR(dataset) logger = get_logger('RL.Test.ear.SEEA.' + str(model)) seea = SEEA(graph, dimension=params['dimension'], learning_rate=params['learning_rate'], batchSize=params['batchSize'], margin=params['margin'], regularizer_scale=params['regularizer_scale'], neg_rate=params['neg_rate'], neg_rel_rate=params['neg_rel_rate']) #Begin SEEA iterations, passing true pairs only to debug the alignments. results = seea.seea_iterate(beta=params['beta'], max_iter=params['max_iter'], max_epochs=params['max_epochs']) try: result_pairs = pd.MultiIndex.from_tuples(results) fscore = log_quality_results(logger, result_pairs, graph.true_pairs, len(graph.entity_pairs), params) except Exception as e: logger.error(e) logger.info("No Aligned pairs found.") ent_embeddings = seea.get_ent_embeddings() export_embeddings('ear', str(model), 'SEEA', graph.entity, ent_embeddings) result_prob = [] for (e1, e2) in graph.entity_pairs: distance = abs( spatial.distance.cosine(ent_embeddings[e1], ent_embeddings[e2])) result_prob.append((e1, e2, distance)) export_result_prob(dataset, 'ear', str(model), 'SEEA', graph.entity, result_prob, graph.true_pairs) try: export_false_negatives(dataset, 'ear', str(model), 'SEEA', graph.entity, result_prob, graph.true_pairs, result_pairs, graph.entity) export_false_positives(dataset, 'ear', str(model), 'SEEA', graph.entity, result_prob, graph.true_pairs, result_pairs, graph.entity) except Exception as e: logger.error(e) #Log MAP, MRR and Hits@K ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs) prec_at_1 = ir_metrics.log_metrics(logger, params) seea.close_tf_session() return (fscore, prec_at_1)
def _test_veer(self, model, columns, params): #Load Graph Data dataset = model() logger = get_logger('RL.Test.VEER.' + str(dataset)) veer = VEER(model, columns, dimension=params['dimension'], learning_rate=params['learning_rate'], margin=params['margin'], regularizer_scale=params['regularizer_scale'], batchSize=params['batchSize']) #Train Model loss, val_loss = veer.train(max_epochs=params['epochs']) logger.info("Training Complete with loss: %f, val_loss:%f", loss, val_loss) #Test Model result_prob, accuracy = veer.test() logger.info("Predict count: %d", len(result_prob)) logger.info( "Sample Prob: %s", str([(c, (a, b) in dataset.true_test_links) for (a, b, c) in result_prob[:20]])) logger.info("Column Weights: %s", str(veer.get_col_weights())) logger.info("Accuracy: %s", str(accuracy)) logger.info("Sample embeddings: %s", str(veer.get_val_embeddings()[0])) #Compute Performance measures optimal_threshold, max_fscore = get_optimal_threshold( result_prob, dataset.true_test_links, max_threshold=2.0) try: params['threshold'] = optimal_threshold result = pd.MultiIndex.from_tuples([(e1, e2) for (e1, e2, d) in result_prob if d <= optimal_threshold]) log_quality_results(logger, result, dataset.true_test_links, len(dataset.test_links), params) except Exception as e: logger.info("Zero Reults") logger.error(e) #Log MAP, MRR and Hits@K ir_metrics = InformationRetrievalMetrics(result_prob, dataset.true_test_links) precison_at_1 = ir_metrics.log_metrics(logger, params) #Write Result Prob to file entitiesA = dataset.get_entity_names(dataset.testDataA) entitiesB = dataset.get_entity_names(dataset.testDataB) index_dictA = { str(dataset.testDataA.iloc[i]._name): i for i in range(dataset.testDataA.shape[0]) } index_dictB = { str(dataset.testDataB.iloc[i]._name): i for i in range(dataset.testDataB.shape[0]) } result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p) for (a, b, p) in result_prob] true_links = [(index_dictA[str(a)], index_dictB[str(b)]) for (a, b) in dataset.true_test_links] export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA, result_prob, true_links, entitiesB) result = [(index_dictA[str(a)], index_dictB[str(b)]) for (a, b) in result] export_false_negatives(model, 'veg', str(dataset), 'VEER', entitiesA, result_prob, true_links, result, entitiesB) export_false_positives(model, 'veg', str(dataset), 'VEER', entitiesA, result_prob, true_links, result, entitiesB) veer.close_tf_session() return (max_fscore, precison_at_1)
def _test_rl_transe(self, model, field_relation_map, params): dataset = model() graph = Graph_VEG(model) logger = get_logger("RL.Test.RLTransE." + str(dataset)) logger.info("values for name : %s", str(graph.relation_value_map[graph.relation[1]][:10])) logger.info("relation: %s", str(graph.relation)) logger.info("train_triples: %s", str(graph.train_triples[:10])) logger.info("set train_triples size %d", len(set(graph.train_triples))) transe = RLTransE(graph, dimension=params['dimension'], learning_rate=params['learning_rate'], margin=params['margin'], regularizer_scale=params['regularizer_scale'], batchSize=params['batchSize'], neg_rate=params['neg_rate'], neg_rel_rate=params['neg_rel_rate']) loss, val_loss = transe.train(max_epochs=params['epochs']) logger.info("Training Complete with loss: %f val_loss: %f", loss, val_loss) value_embeddings = transe.get_val_embeddings() relation_embeddings = transe.get_rel_embeddings() result_prob = [] distance_distribution = [] missing_values = [] for (a, b) in dataset.test_links: row_a = dataset.testDataA.loc[a] row_b = dataset.testDataB.loc[b] distance = 0 dd = [] for f in field_relation_map: val_a = row_a[f] val_b = row_b[f] if val_a == val_b: dd.append(0) else: rel = field_relation_map[f] try: val_index_a = graph.relation_value_map[rel].index( val_a) except ValueError: missing_values.append(val_a) distance = distance + 1 dd.append(1) continue try: val_index_b = graph.relation_value_map[rel].index( val_b) except ValueError: missing_values.append(val_b) distance = distance + 1 dd.append(1) continue rel_index = graph.relation.index(field_relation_map[f]) cur_distance = abs( spatial.distance.cosine( value_embeddings[rel][val_index_a] + relation_embeddings[rel_index], value_embeddings[rel][val_index_b])) distance = distance + cur_distance dd.append(cur_distance) result_prob.append((a, b, distance)) distance_distribution.append((a, b, dd, distance)) #logger.info("a: %d, b: %d distance: %f true_pairs: %s", a, b, distance, (a,b) in dataset.true_test_links) logger.info("No. of missing values: %d", len(missing_values)) logger.info("Unique No. of missing values: %d", len(set(missing_values))) try: entities = ["value\trelation"] for r in graph.relation_value_map: for v in graph.relation_value_map[r]: entities.append("\t".join([v, r])) embeddings = [] for rel in value_embeddings: val_count = len(graph.relation_value_map[rel]) embeddings.extend(value_embeddings[rel][:val_count]) #Write Embeddings to file export_embeddings('veg', str(dataset), 'RLTransE_val', entities, embeddings) export_embeddings('veg', str(dataset), 'RLTransE_rel', graph.relation, relation_embeddings) except Exception as e: logger.error("Failed to export embeddings") logger.error(e) optimal_threshold, max_fscore = get_optimal_threshold( result_prob, dataset.true_test_links, max_threshold=3.0, step=0.02) try: params['threshold'] = optimal_threshold result = pd.MultiIndex.from_tuples([(e1, e2) for (e1, e2, d) in result_prob if d <= optimal_threshold]) log_quality_results(logger, result, dataset.true_test_links, len(dataset.test_links), params) except: logger.info("Zero Reults") #Log MAP, MRR and Hits@K ir_metrics = InformationRetrievalMetrics(result_prob, dataset.true_test_links) precison_at_1 = ir_metrics.log_metrics(logger, params) transe.close_tf_session() #Export False Positives and result porobabilities get_entity_name = lambda d, i: "_".join([ str(d.iloc[i][dataset.field_map[CensusFields.ID_INDIVIDUAL]]), str(d.iloc[i][dataset.field_map[CensusFields.DNI]]) ]) get_entity_name_loc = lambda d, i: "_".join([ str(d.loc[i][dataset.field_map[CensusFields.ID_INDIVIDUAL]]), str(d.loc[i][dataset.field_map[CensusFields.DNI]]) ]) entitiesA = [ get_entity_name(dataset.testDataA, i) for i in range(int(dataset.testDataA.shape[0])) ] entitiesB = [ get_entity_name(dataset.testDataB, i) for i in range(int(dataset.testDataB.shape[0])) ] result_prob = [ (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))), entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))), p) for (a, b, p) in result_prob ] true_links = [ (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))), entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b)))) for (a, b) in dataset.true_test_links ] export_result_prob(Census, 'veg', 'census', 'rltranse', entitiesA, result_prob, true_links, entitiesB) distance_distribution = [ (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))), entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))), [str("%.2f" % (float(w))) for w in dd], 1 - d) for (e1, e2, dd, d) in distance_distribution if (e1, e2) in result ] export_human_readable_results(Census, 'veg', 'census', 'rltranse', entitiesA, distance_distribution, entitiesB) result = [ (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))), entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b)))) for (a, b) in result ] export_false_negatives(Census, 'veg', 'census', 'rltranse', entitiesA, result_prob, true_links, result, entitiesB) export_false_positives(Census, 'veg', 'census', 'rltranse', entitiesA, result_prob, true_links, result, entitiesB) return (max_fscore, precison_at_1)
def test_veer(self): logger = get_logger('RL.Test.VEER.Census') dataset = Census() #Columns of interest for Sant Feliu town columns = [ 'Noms_harmo', 'cognom_1', 'cohort', 'estat_civil', 'parentesc_har', 'ocupacio_hisco' ] params = { 'learning_rate': 0.1, 'margin': 0.1, 'dimension': 32, 'epochs': 50, 'regularizer_scale': 0.1, 'batchSize': 512 } veer = VEER(Census, columns, dimension=params['dimension'], learning_rate=params['learning_rate'], margin=params['margin'], regularizer_scale=params['regularizer_scale'], batchSize=params['batchSize']) #Train Model loss, val_loss = veer.train(max_epochs=params['epochs']) logger.info("Training Complete with loss: %f, val_loss:%f", loss, val_loss) #Test Model result_prob, accuracy = veer.test() logger.info("Predict count: %d", len(result_prob)) logger.info( "Sample Prob: %s", str([(c, (a, b) in dataset.true_test_links) for (a, b, c) in result_prob[:20]])) logger.info("Column Weights: %s", str(veer.get_col_weights())) logger.info("Accuracy: %s", str(accuracy)) logger.info("Sample embeddings: %s", str(veer.get_val_embeddings()[0])) #Compute Performance measures optimal_threshold, max_fscore = get_optimal_threshold( result_prob, dataset.true_test_links, max_threshold=2.0) try: params['threshold'] = optimal_threshold result = pd.MultiIndex.from_tuples([(e1, e2) for (e1, e2, d) in result_prob if d <= optimal_threshold]) log_quality_results(logger, result, dataset.true_test_links, len(dataset.test_links), params) except Exception as e: logger.info("Zero Reults") logger.error(e) #Log MAP, MRR and Hits@K ir_metrics = InformationRetrievalMetrics(result_prob, dataset.true_test_links) precison_at_1 = ir_metrics.log_metrics(logger, params) #Export embeddings embeddings = veer.get_val_embeddings() export_embeddings('veg', 'census', 'veer', veer.values, embeddings) #Write Result Prob to file result_feature_mapping = [(e1, e2, [ str( abs( spatial.distance.cosine( embeddings[veer.values.index( veer._clean(dataset.testDataA.loc[e1][c]))], embeddings[veer.values.index( veer._clean(dataset.testDataB.loc[e2][c]))]))) for c in columns ], d) for (e1, e2, d) in result_prob if (e1, e2) in result] entitiesA = dataset.get_entity_names(dataset.testDataA) entitiesB = dataset.get_entity_names(dataset.testDataB) index_dictA = { str(dataset.testDataA.iloc[i]._name): i for i in range(dataset.testDataA.shape[0]) } index_dictB = { str(dataset.testDataB.iloc[i]._name): i for i in range(dataset.testDataB.shape[0]) } result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p) for (a, b, p) in result_prob] export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA, result_prob, dataset.true_test_links, entitiesB) export_false_negatives(Census, 'veg', str(dataset), 'VEER', entitiesA, result_prob, dataset.true_test_links, result, entitiesB) export_false_positives(Census, 'veg', str(dataset), 'VEER', entitiesA, result_prob, dataset.true_test_links, result, entitiesB) result_feature_mapping = [(index_dictA[str(a)], index_dictB[str(b)], w, p) for (a, b, w, p) in result_feature_mapping] export_human_readable_results(Census, 'veg', str(dataset), 'VEER', entitiesA, result_feature_mapping, entitiesB) veer.close_tf_session()
def test_logistic(self): logger = get_logger('RL.Test.LogisticRegression.Census') census = Census() compare_cl = census.get_comparision_object() features = compare_cl.compute(census.candidate_links, census.trainDataA, census.trainDataB) logger.info("Train Features %s", str(features.describe())) # Train ECM Classifier logrg = recordlinkage.LogisticRegressionClassifier() logrg.fit(features, census.true_links) result = logrg.predict(features) log_quality_results(logger, result, census.true_links, len(census.candidate_links)) #Validate the classifier compare_cl = census.get_comparision_object() features = compare_cl.compute(census.val_links, census.valDataA, census.valDataB) logger.info("Validation Features %s", str(features.describe())) result = logrg.predict(features) log_quality_results(logger, result, census.true_val_links, len(census.val_links)) #Test the classifier compare_cl = census.get_comparision_object() features = compare_cl.compute(census.test_links, census.testDataA, census.testDataB) logger.info("Test Features %s", str(features.describe())) result = logrg.predict(features) log_quality_results(logger, result, census.true_test_links, len(census.test_links)) logger.info("logrg coefficients: %s", str(logrg.coefficients)) #Log IR Stats: MRR, MAP, MP@K prob_series = logrg.prob(features) prob = [(1 - p) for p in prob_series.tolist()] result_prob = [(census.test_links[i][0], census.test_links[i][1], prob[i]) for i in range(0, len(prob))] ir_metrics = InformationRetrievalMetrics(result_prob, census.true_test_links) ir_metrics.log_metrics(logger) #Export False Positives and result porobabilities result_feature_mapping = [ (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d) for (e1, e2, d) in result_prob if (e1, e2) in result ] get_entity_name = lambda c, d, i: "_".join([ str(d.iloc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]), str(d.iloc[i][c.field_map[CensusFields.DNI]]) ]) get_entity_name_loc = lambda c, d, i: "_".join([ str(d.loc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]), str(d.loc[i][c.field_map[CensusFields.DNI]]) ]) entitiesA = [ get_entity_name(census, census.testDataA, i) for i in range(int(census.testDataA.shape[0])) ] entitiesB = [ get_entity_name(census, census.testDataB, i) for i in range(int(census.testDataB.shape[0])) ] result_prob = [(entitiesA.index( get_entity_name_loc(census, census.testDataA, int(a))), entitiesB.index( get_entity_name_loc(census, census.testDataB, int(b))), p) for (a, b, p) in result_prob] true_links = [(entitiesA.index( get_entity_name_loc(census, census.testDataA, int(a))), entitiesB.index( get_entity_name_loc(census, census.testDataB, int(b)))) for (a, b) in census.true_test_links] export_result_prob(Census, 'LogisticRegression', 'census', 'logistic', entitiesA, result_prob, true_links, entitiesB) result = [(entitiesA.index( get_entity_name_loc(census, census.testDataA, int(a))), entitiesB.index( get_entity_name_loc(census, census.testDataB, int(b)))) for (a, b) in result] export_false_negatives(Census, 'LogisticRegression', 'census', 'logistic', entitiesA, result_prob, true_links, result, entitiesB) export_false_positives(Census, 'LogisticRegression', 'census', 'logistic', entitiesA, result_prob, true_links, result, entitiesB) weights = logrg.coefficients result = [ (e1, e2, [str("%.2f" % (float(d * w) / sum(weights))) for w in weights], d) for (e1, e2, d) in result_prob if (e1, e2) in result ] result_feature_mapping = [ (entitiesA.index( get_entity_name_loc(census, census.testDataA, int(a))), entitiesB.index( get_entity_name_loc(census, census.testDataB, int(b))), w, p) for (a, b, w, p) in result_feature_mapping ] export_human_readable_results(Census, 'LogisticRegression', 'census', 'logistic', entitiesA, result_feature_mapping, entitiesB)
def test_ecm(self): logger = get_logger('RL.Test.ECMClassifier.Census') census = Census() compare_cl = census.get_comparision_object() features = compare_cl.compute(census.candidate_links, census.trainDataA, census.trainDataB) logger.info("Train Features %s", str(features.describe())) # Train ECM Classifier logrg = recordlinkage.ECMClassifier() logrg.fit(features) result = logrg.predict(features) log_quality_results(logger, result, census.true_links, len(census.candidate_links)) #Validate the classifier compare_cl = census.get_comparision_object() features = compare_cl.compute(census.val_links, census.valDataA, census.valDataB) logger.info("Validation Features %s", str(features.describe())) result = logrg.predict(features) log_quality_results(logger, result, census.true_val_links, len(census.val_links)) #Test the classifier compare_cl = census.get_comparision_object() features = compare_cl.compute(census.test_links, census.testDataA, census.testDataB) logger.info("Test Features %s", str(features.describe())) result = logrg.predict(features) log_quality_results(logger, result, census.true_test_links, len(census.test_links)) logger.info("ECM weights: %s", str(logrg.weights)) #Log IR Stats: MRR, MAP, MP@K prob_series = logrg.prob(features) prob = [(1 - p) for p in prob_series.tolist()] result_prob = [(census.test_links[i][0], census.test_links[i][1], prob[i]) for i in range(0, len(prob))] ir_metrics = InformationRetrievalMetrics(result_prob, census.true_test_links) ir_metrics.log_metrics(logger) #Export False Positives and result porobabilities result_feature_mapping = [ (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d) for (e1, e2, d) in result_prob if (e1, e2) in result ] get_entity_name = lambda c, d, i: "_".join([ str(d.iloc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]), str(d.iloc[i][c.field_map[CensusFields.DNI]]) ]) get_entity_name_loc = lambda c, d, i: "_".join([ str(d.loc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]), str(d.loc[i][c.field_map[CensusFields.DNI]]) ]) start_time = timeit.default_timer() entitiesA = [ get_entity_name(census, census.testDataA, i) for i in range(int(census.testDataA.shape[0])) ] entitiesB = [ get_entity_name(census, census.testDataB, i) for i in range(int(census.testDataB.shape[0])) ] logger.info("Entities built in %s", str(timeit.default_timer() - start_time)) start_time = timeit.default_timer() result_prob = [(entitiesA.index( get_entity_name_loc(census, census.testDataA, int(a))), entitiesB.index( get_entity_name_loc(census, census.testDataB, int(b))), p) for (a, b, p) in result_prob] logger.info("Result prob in %s", str(timeit.default_timer() - start_time)) start_time = timeit.default_timer() true_links = [(entitiesA.index( get_entity_name_loc(census, census.testDataA, int(a))), entitiesB.index( get_entity_name_loc(census, census.testDataB, int(b)))) for (a, b) in census.true_test_links] logger.info("true_links in %s", str(timeit.default_timer() - start_time)) start_time = timeit.default_timer() export_result_prob(Census, 'ECM', 'census', 'ecm', entitiesA, result_prob, true_links, entitiesB) logger.info("Result prob EXPORTED in %s", str(timeit.default_timer() - start_time)) start_time = timeit.default_timer() result = [(entitiesA.index( get_entity_name_loc(census, census.testDataA, int(a))), entitiesB.index( get_entity_name_loc(census, census.testDataB, int(b)))) for (a, b) in result] export_false_negatives(Census, 'ECM', 'census', 'ecm', entitiesA, result_prob, true_links, result, entitiesB) export_false_positives(Census, 'ECM', 'census', 'ecm', entitiesA, result_prob, true_links, result, entitiesB) logger.info("FP & FN EXPORTED in %s", str(timeit.default_timer() - start_time)) result_feature_mapping = [ (entitiesA.index( get_entity_name_loc(census, census.testDataA, int(a))), entitiesB.index( get_entity_name_loc(census, census.testDataB, int(b))), w, p) for (a, b, w, p) in result_feature_mapping ] export_human_readable_results(Census, 'ECM', 'census', 'ecm', entitiesA, result_feature_mapping, entitiesB) logger.info("Exported Human Readable Results")