def _align_lr(self, model_path, source_kb, target_kb, candidate_selector):
    """
    Align two knowledge bases with a saved logistic regression model.

    Each source entity is paired with its leading candidates from the
    target KB; a pair is kept when its model score reaches
    constants.LR_SCORE_THRESHOLD.
    :param model_path: location the model is loaded from
    :param source_kb: KB whose entities are iterated over
    :param target_kb: KB in which candidate entity ids are looked up
    :param candidate_selector: object whose select_candidates(entity_id)
        returns a sliceable sequence of target entity ids
    :return: list of (source id, target id, score) tuples
    """
    feat_gen = EngineeredFeatureGenerator()

    sys.stdout.write("Loading model...\n")
    lr_model = OntoEmmaLRModel()
    lr_model.load(model_path)

    sys.stdout.write("Making predictions...\n")
    matches = []
    source_entities = source_kb.entities
    progress = tqdm.tqdm(source_entities, total=len(source_entities))
    for source_entity in progress:
        source_id = source_entity.research_entity_id
        candidate_ids = candidate_selector.select_candidates(source_id)
        # only the first KEEP_TOP_K_CANDIDATES candidates are scored
        for target_id in candidate_ids[:constants.KEEP_TOP_K_CANDIDATES]:
            target_entity = target_kb.get_entity_by_research_entity_id(
                target_id)
            pair_features = feat_gen.calculate_features(
                source_kb.form_json_entity(source_entity),
                target_kb.form_json_entity(target_entity))
            # the model is handed a one-element batch, so row 0 is this pair;
            # column 1 is presumably the "match" class score -- TODO confirm
            prediction = lr_model.predict_entity_pair([pair_features])
            pair_score = prediction[0][1]
            if pair_score >= constants.LR_SCORE_THRESHOLD:
                matches.append((source_id, target_id, pair_score))
    return matches
def _evaluate_lr(self, model_path: str, evaluation_data_file: str):
    """
    Evaluate a saved logistic regression model on labelled entity pairs.

    A pair counts as a predicted match when prediction[0][1] exceeds
    constants.SIM_SCORE_THRESHOLD. Predictions are tallied against the
    labels (1 = match, 0 = non-match) into a confusion matrix from which
    the metrics are derived.
    :param model_path: location the model is loaded from
    :param evaluation_data_file: alignment file handed to
        self._alignments_to_pairs_and_labels
    :return: dict with keys 'precision', 'recall', 'accuracy' and
        'f1_score'; a metric is left at 0.0 when its denominator is zero
    """
    # load model from disk
    model = OntoEmmaLRModel()
    model.load(model_path)

    # load evaluation data
    eval_pairs, eval_labels = self._alignments_to_pairs_and_labels(
        evaluation_data_file)

    # initialize feature generator
    feat_gen = EngineeredFeatureGenerator()
    eval_features = [
        feat_gen.calculate_features(s_ent, t_ent)
        for s_ent, t_ent in eval_pairs
    ]

    # compute metrics
    tp, fp, tn, fn = (0, 0, 0, 0)
    precision, recall, accuracy, f1_score = (0.0, 0.0, 0.0, 0.0)

    for features, label in zip(eval_features, eval_labels):
        # NOTE(review): the feature vector is passed as-is (not wrapped in a
        # list) and the cut-off is SIM_SCORE_THRESHOLD -- confirm both match
        # what predict_entity_pair and the alignment code expect.
        prediction = model.predict_entity_pair(features)
        predicted_match = prediction[0][1] > constants.SIM_SCORE_THRESHOLD

        if predicted_match and label == 1:
            tp += 1
        elif predicted_match and label == 0:
            fp += 1
        elif label == 1:
            # A true match that was not predicted is a false negative no
            # matter how confident the "non-match" score is; requiring
            # prediction[0][0] to exceed the threshold as well would count
            # uncertain misses as true negatives.
            fn += 1
        else:
            tn += 1

    if tp + fp > 0:
        precision = tp / (tp + fp)
    if tp + fn > 0:
        recall = tp / (tp + fn)
    if tp + fp + fn + tn > 0:
        accuracy = (tp + tn) / (tp + fp + fn + tn)
    if precision + recall > 0.0:
        f1_score = (2 * precision * recall / (precision + recall))

    metrics = {
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'f1_score': f1_score
    }

    return metrics
def _align_lr(self, model_path, source_kb, target_kb, candidate_selector):
    """
    Align two knowledge bases using a logistic regression model.

    Loads the model from disk and hands the alignment itself over to
    self._apply_model_align.
    :param model_path: location the model is loaded from
    :param source_kb: source KB, forwarded unchanged
    :param target_kb: target KB, forwarded unchanged
    :param candidate_selector: candidate selector, forwarded unchanged
    :return: whatever self._apply_model_align returns
    """
    sys.stdout.write("Loading model...\n")
    lr_model = OntoEmmaLRModel()
    lr_model.load(model_path)

    alignment = self._apply_model_align(
        lr_model, source_kb, target_kb, candidate_selector)
    return alignment
def _train_lr(self, model_path: str, config_file: str):
    """
    Train a logistic regression model and save it to disk.

    The JSON config names the training and validation alignment files.
    Features are generated for both sets, the model is fitted on the
    training set, accuracy on each set is written to stdout, and the model
    is saved.
    :param model_path: location the trained model is saved to
    :param config_file: JSON file providing 'train_data_path' and
        'validation_data_path'
    :return: None
    """
    lr_model = OntoEmmaLRModel()

    # read model config and pull out the two data locations
    with open(config_file, 'r') as config_handle:
        settings = json.load(config_handle)
    train_path = settings['train_data_path']
    dev_path = settings['validation_data_path']

    # load training and dev data
    train_pairs, train_labels = self._alignments_to_pairs_and_labels(
        train_path)
    dev_pairs, dev_labels = self._alignments_to_pairs_and_labels(dev_path)

    sys.stdout.write('Training data size: %i\n' % len(train_labels))
    sys.stdout.write('Development data size: %i\n' % len(dev_labels))

    def _featurize(pairs):
        # Each data set gets its own generator, built over every entity
        # that appears in any of its pairs.
        entities = []
        for pair in pairs:
            entities.extend(pair)
        generator = FeatureGeneratorLR(entities)

        features = []
        for s_ent, t_ent in pairs:
            features.append(
                generator.calculate_features(s_ent['research_entity_id'],
                                             t_ent['research_entity_id']))
        return features

    train_features = _featurize(train_pairs)
    dev_features = _featurize(dev_pairs)

    lr_model.train(train_features, train_labels)

    train_accuracy = lr_model.score_accuracy(train_features, train_labels)
    sys.stdout.write(
        "Accuracy on training data set: %.2f\n" % train_accuracy)

    dev_accuracy = lr_model.score_accuracy(dev_features, dev_labels)
    sys.stdout.write(
        "Accuracy on development data set: %.2f\n" % dev_accuracy)

    lr_model.save(model_path)
def _train_lr(self, model_path: str, config_file: str):
    """
    Train a logistic regression model.

    Creates a fresh OntoEmmaLRModel and passes it, with both arguments, to
    self._apply_model_train, which does the actual work.
    :param model_path: forwarded unchanged to self._apply_model_train
    :param config_file: forwarded unchanged to self._apply_model_train
    :return: None
    """
    lr_model = OntoEmmaLRModel()
    self._apply_model_train(lr_model, model_path, config_file)
    return None
def _align_lr(self, model_path, source_kb, target_kb, candidate_selector):
    """
    Align two knowledge bases with a saved logistic regression model.

    A single feature generator is built over the JSON form of every entity
    in both KBs. Each source entity is then scored against its leading
    candidates, and pairs whose score reaches constants.LR_SCORE_THRESHOLD
    are kept.
    :param model_path: location the model is loaded from
    :param source_kb: KB whose entities are iterated over
    :param target_kb: KB the candidate ids refer to
    :param candidate_selector: object whose select_candidates(entity_id)
        returns a sliceable sequence of target entity ids
    :return: list of (source id, target id, score) tuples
    """

    def _form_json_entity(ent, kb):
        # Builds the dict representation of an entity that the feature
        # generator is given.

        def _linked_ids(rel_labels):
            # second entity id of each of ent's relations whose type is
            # one of rel_labels
            linked = []
            for rel_id in ent.relation_ids:
                relation = kb.relations[rel_id]
                if relation.relation_type in rel_labels:
                    linked.append(relation.entity_ids[1])
            return linked

        def _known_names(entity_ids):
            # canonical names of the ids that kb actually indexes;
            # ids missing from the index are skipped
            names = []
            for ent_id in entity_ids:
                if ent_id in kb.research_entity_id_to_entity_index:
                    linked_ent = kb.get_entity_by_research_entity_id(ent_id)
                    names.append(linked_ent.canonical_name)
            return names

        parent_ids = _linked_ids(constants.UMLS_PARENT_REL_LABELS)
        child_ids = _linked_ids(constants.UMLS_CHILD_REL_LABELS)
        parent_names = _known_names(parent_ids)
        child_names = _known_names(child_ids)

        return {
            'research_entity_id': ent.research_entity_id,
            'canonical_name': ent.canonical_name,
            'aliases': ent.aliases,
            'definition': ent.definition,
            'par_relations': parent_names,
            'chd_relations': child_names
        }

    matches = []

    source_json = [
        _form_json_entity(ent, source_kb) for ent in source_kb.entities
    ]
    target_json = [
        _form_json_entity(ent, target_kb) for ent in target_kb.entities
    ]
    feat_gen = FeatureGeneratorLR(source_json + target_json)

    sys.stdout.write("Loading model...\n")
    lr_model = OntoEmmaLRModel()
    lr_model.load(model_path)

    sys.stdout.write("Making predictions...\n")
    progress = tqdm.tqdm(source_kb.entities, total=len(source_kb.entities))
    for source_entity in progress:
        source_id = source_entity.research_entity_id
        candidate_ids = candidate_selector.select_candidates(source_id)
        # only the first KEEP_TOP_K_CANDIDATES candidates are scored
        for target_id in candidate_ids[:constants.KEEP_TOP_K_CANDIDATES]:
            pair_features = feat_gen.calculate_features(source_id, target_id)
            # one-element batch: row 0 is this pair; column 1 is presumably
            # the "match" class score -- TODO confirm
            prediction = lr_model.predict_entity_pair([pair_features])
            pair_score = prediction[0][1]
            if pair_score >= constants.LR_SCORE_THRESHOLD:
                matches.append((source_id, target_id, pair_score))

    return matches