Example #1
    def _align_lr(self, model_path, source_kb, target_kb, candidate_selector):
        """
        Align using logistic regression model
        :param source_kb:
        :param target_kb:
        :param candidate_selector:
        :return:
        """

        alignment = []

        feature_generator = EngineeredFeatureGenerator()

        sys.stdout.write("Loading model...\n")
        model = OntoEmmaLRModel()
        model.load(model_path)

        sys.stdout.write("Making predictions...\n")
        s_ent_tqdm = tqdm.tqdm(source_kb.entities,
                               total=len(source_kb.entities))
        for s_ent in s_ent_tqdm:
            s_ent_id = s_ent.research_entity_id
            for t_ent_id in candidate_selector.select_candidates(
                    s_ent_id)[:constants.KEEP_TOP_K_CANDIDATES]:
                t_ent = target_kb.get_entity_by_research_entity_id(t_ent_id)
                features = [
                    feature_generator.calculate_features(
                        source_kb.form_json_entity(s_ent),
                        target_kb.form_json_entity(t_ent))
                ]
                score = model.predict_entity_pair(features)
                if score[0][1] >= constants.LR_SCORE_THRESHOLD:
                    alignment.append((s_ent_id, t_ent_id, score[0][1]))

        return alignment
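
The alignment returned above is just a list of (source_entity_id, target_entity_id, score) tuples. As a usage note, here is a minimal sketch of dumping such a list to a tab-separated file; the helper and output path are hypothetical and not part of OntoEmma:

import csv

def write_alignment_tsv(alignment, out_path):
    # alignment: list of (source_entity_id, target_entity_id, score) tuples,
    # as returned by _align_lr; out_path is a hypothetical output file
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['source_entity_id', 'target_entity_id', 'score'])
        for s_ent_id, t_ent_id, score in alignment:
            writer.writerow([s_ent_id, t_ent_id, score])
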
Example #2
    def _evaluate_lr(self, model_path: str, evaluation_data_file: str):
        """

        :param model_path:
        :param evaluation_data_file:
        :return:
        """
        # load model from disk
        model = OntoEmmaLRModel()
        model.load(model_path)

        # load evaluation data
        eval_pairs, eval_labels = self._alignments_to_pairs_and_labels(
            evaluation_data_file)

        # initialize feature generator
        feat_gen = EngineeredFeatureGenerator()
        eval_features = [
            feat_gen.calculate_features(s_ent, t_ent)
            for s_ent, t_ent in eval_pairs
        ]

        # compute metrics
        tp, fp, tn, fn = (0, 0, 0, 0)
        precision, recall, accuracy, f1_score = (0.0, 0.0, 0.0, 0.0)

        for features, label in zip(eval_features, eval_labels):
            # wrap the single feature vector in a list, as in the alignment code above
            prediction = model.predict_entity_pair([features])
            if prediction[0][1] > constants.SIM_SCORE_THRESHOLD and label == 1:
                tp += 1
            elif prediction[0][1] > constants.SIM_SCORE_THRESHOLD and label == 0:
                fp += 1
            elif prediction[0][0] > constants.SIM_SCORE_THRESHOLD and label == 1:
                fn += 1
            else:
                tn += 1

        if tp + fp > 0:
            precision = tp / (tp + fp)
        if tp + fn > 0:
            recall = tp / (tp + fn)
        if tp + fp + fn + tn > 0:
            accuracy = (tp + tn) / (tp + fp + fn + tn)
        if precision + recall > 0.0:
            f1_score = (2 * precision * recall / (precision + recall))

        metrics = {
            'precision': precision,
            'recall': recall,
            'accuracy': accuracy,
            'f1_score': f1_score
        }
        return metrics
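
As a quick sanity check, the counts-based metrics above can be reproduced with scikit-learn after thresholding each positive-class score into a binary prediction. A minimal sketch, assuming scikit-learn is available (it is not required by the method above):

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def check_metrics(positive_scores, eval_labels, threshold):
    # positive_scores: positive-class probabilities, one per evaluation pair
    # eval_labels: gold 0/1 labels; threshold plays the role of SIM_SCORE_THRESHOLD
    predictions = [1 if s > threshold else 0 for s in positive_scores]
    return {
        'precision': precision_score(eval_labels, predictions, zero_division=0),
        'recall': recall_score(eval_labels, predictions, zero_division=0),
        'accuracy': accuracy_score(eval_labels, predictions),
        'f1_score': f1_score(eval_labels, predictions, zero_division=0),
    }
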
Example #3
    def _align_lr(self, model_path, source_kb, target_kb, candidate_selector):
        """
        Align using logistic regression model
        :param model_path: path to the serialized OntoEmmaLRModel
        :param source_kb: source knowledge base
        :param target_kb: target knowledge base
        :param candidate_selector: selects candidate target entity ids for each source entity
        :return: list of (source_entity_id, target_entity_id, score) tuples
        """
        sys.stdout.write("Loading model...\n")
        model = OntoEmmaLRModel()
        model.load(model_path)
        return self._apply_model_align(model, source_kb, target_kb,
                                       candidate_selector)
Example #4
    def _train_lr(self, model_path: str, config_file: str):
        """
        Train a logistic regression model
        :param model_path:
        :param config_file:
        :return:
        """
        model = OntoEmmaLRModel()

        # read model config
        with open(config_file, 'r') as f:
            config = json.load(f)

        # parse parameters
        training_data_path = config['train_data_path']
        dev_data_path = config['validation_data_path']

        # load training and dev data
        training_pairs, training_labels = self._alignments_to_pairs_and_labels(
            training_data_path)
        dev_pairs, dev_labels = self._alignments_to_pairs_and_labels(
            dev_data_path)

        sys.stdout.write('Training data size: %i\n' % len(training_labels))
        sys.stdout.write('Development data size: %i\n' % len(dev_labels))

        # generate features for training pairs
        feat_gen_train = FeatureGeneratorLR(
            [item for sublist in training_pairs for item in sublist])
        training_features = [
            feat_gen_train.calculate_features(s_ent['research_entity_id'],
                                              t_ent['research_entity_id'])
            for s_ent, t_ent in training_pairs
        ]

        # generate features for development pairs
        feat_gen_dev = FeatureGeneratorLR(
            [item for sublist in dev_pairs for item in sublist])
        dev_features = [
            feat_gen_dev.calculate_features(s_ent['research_entity_id'],
                                            t_ent['research_entity_id'])
            for s_ent, t_ent in dev_pairs
        ]

        model.train(training_features, training_labels)

        training_accuracy = model.score_accuracy(training_features,
                                                 training_labels)
        sys.stdout.write("Accuracy on training data set: %.2f\n" %
                         training_accuracy)

        dev_accuracy = model.score_accuracy(dev_features, dev_labels)
        sys.stdout.write("Accuracy on development data set: %.2f\n" %
                         dev_accuracy)

        model.save(model_path)
        return
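
The config file read by _train_lr only needs the two keys parsed above. A minimal sketch of writing such a config, with hypothetical placeholder paths:

import json

config = {
    'train_data_path': 'path/to/training_alignments',          # hypothetical path
    'validation_data_path': 'path/to/development_alignments'   # hypothetical path
}
with open('lr_model_config.json', 'w') as f:
    json.dump(config, f, indent=2)
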
Example #5
    def _train_lr(self, model_path: str, config_file: str):
        """
        Train a logistic regression model
        :param model_path: path where the trained OntoEmmaLRModel is saved
        :param config_file: JSON config specifying train_data_path and validation_data_path
        :return:
        """
        model = OntoEmmaLRModel()
        self._apply_model_train(model, model_path, config_file)
        return
Example #6
    def _align_lr(self, model_path, source_kb, target_kb, candidate_selector):
        """
        Align using logistic regression model
        :param source_kb:
        :param target_kb:
        :param candidate_selector:
        :return:
        """

        # returns json representation of entity that matches what feature generator expects
        def _form_json_entity(ent, kb):
            parent_ids = [
                kb.relations[rel_id].entity_ids[1]
                for rel_id in ent.relation_ids
                if kb.relations[rel_id].relation_type in
                constants.UMLS_PARENT_REL_LABELS
            ]

            child_ids = [
                kb.relations[rel_id].entity_ids[1]
                for rel_id in ent.relation_ids
                if kb.relations[rel_id].relation_type in
                constants.UMLS_CHILD_REL_LABELS
            ]

            parents = [
                kb.get_entity_by_research_entity_id(i).canonical_name
                for i in parent_ids
                if i in kb.research_entity_id_to_entity_index
            ]

            children = [
                kb.get_entity_by_research_entity_id(i).canonical_name
                for i in child_ids
                if i in kb.research_entity_id_to_entity_index
            ]

            return {
                'research_entity_id': ent.research_entity_id,
                'canonical_name': ent.canonical_name,
                'aliases': ent.aliases,
                'definition': ent.definition,
                'par_relations': parents,
                'chd_relations': children
            }

        alignment = []

        feature_generator = FeatureGeneratorLR(
            [_form_json_entity(ent, source_kb) for ent in source_kb.entities] +
            [_form_json_entity(ent, target_kb) for ent in target_kb.entities])

        sys.stdout.write("Loading model...\n")
        model = OntoEmmaLRModel()
        model.load(model_path)

        sys.stdout.write("Making predictions...\n")
        s_ent_tqdm = tqdm.tqdm(source_kb.entities,
                               total=len(source_kb.entities))
        for s_ent in s_ent_tqdm:
            s_ent_id = s_ent.research_entity_id
            for t_ent_id in candidate_selector.select_candidates(
                    s_ent_id)[:constants.KEEP_TOP_K_CANDIDATES]:
                features = [
                    feature_generator.calculate_features(s_ent_id, t_ent_id)
                ]
                score = model.predict_entity_pair(features)
                if score[0][1] >= constants.LR_SCORE_THRESHOLD:
                    alignment.append((s_ent_id, t_ent_id, score[0][1]))

        return alignment
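
The alignment list produced here can contain several scored candidates for the same source entity. A minimal sketch, not part of the original code, of reducing it to the single best-scoring target per source entity:

def best_match_per_source(alignment):
    # alignment: list of (source_entity_id, target_entity_id, score) tuples
    best = {}
    for s_ent_id, t_ent_id, score in alignment:
        if s_ent_id not in best or score > best[s_ent_id][1]:
            best[s_ent_id] = (t_ent_id, score)
    return [(s_id, t_id, score) for s_id, (t_id, score) in best.items()]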