def sample_negative_mappings(self, kb1, kb2, tp_mappings):
        """
        Given two KBs and true positive mappings, sample easy and hard
        negatives for training data.

        For every distinct (source, target) pair in ``tp_mappings``:
          * hard negatives are sampled from the first
            ``constants.KEEP_TOP_K_CANDIDATES`` candidates that candidate
            selection returns for the source entity;
          * easy negatives are sampled at random from all entity ids of
            ``kb2``.
        Any sampled pair that is itself a true positive is discarded, and a
        pair that is already a hard negative is not repeated as an easy one.

        :param kb1: source KB
        :param kb2: target KB
        :param tp_mappings: true positive mappings; only the first two items
            of each mapping (source id, target id) are used
        :return: list of ``[source_id, target_id, label, self.umls_header]``
            rows, where label is 0 for hard negatives and -1 for easy
            negatives
        """
        cand_sel = CandidateSelection(kb1, kb2)

        sys.stdout.write('\t\tExtracting candidates...\n')
        kb2_ent_ids = [e.research_entity_id for e in kb2.entities]
        tps = {tuple(i[:2]) for i in tp_mappings}

        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the easy sample size at the number of
        # target entities (the hard sample below is capped the same way).
        num_easy = min(constants.NUM_EASY_NEGATIVE_PER_POSITIVE,
                       len(kb2_ent_ids))

        cand_negs = []
        rand_negs = []

        # sample negatives for each true positive (TP)
        for tp in tps:
            # get candidates for source entity
            cands = cand_sel.select_candidates(
                tp[0])[:constants.KEEP_TOP_K_CANDIDATES]
            # sample hard negatives
            cand = random.sample(
                cands, min(constants.NUM_HARD_NEGATIVE_PER_POSITIVE,
                           len(cands)))
            cand_negs.extend((tp[0], c) for c in cand)
            # sample easy negatives
            rand = random.sample(kb2_ent_ids, num_easy)
            rand_negs.extend((tp[0], r) for r in rand)

        # filter negatives: drop true positives, and keep the hard label when
        # a pair was drawn both as a hard and as an easy negative
        hard_negatives = set(cand_negs).difference(tps)
        easy_negatives = set(rand_negs).difference(tps).difference(
            hard_negatives)

        # append negative pairs together with labels:
        # (0 = hard negative, -1 = easy negative)
        neg_pairs = [[neg[0], neg[1], 0, self.umls_header]
                     for neg in hard_negatives]
        neg_pairs.extend([neg[0], neg[1], -1, self.umls_header]
                         for neg in easy_negatives)

        return neg_pairs
Exemplo n.º 2
0
    def align(self,
              model_type, model_path,
              s_kb_path, t_kb_path,
              gold_path, output_path,
              align_strat, cuda_device=-1,
              missed_path=None):
        """
        Run the alignment pipeline on a source and a target ontology.

        Steps, in order: load and normalize both KBs, build the candidate
        selector, score candidate pairs with the requested model, compute
        neighborhood similarities, derive the alignment, optionally evaluate
        it against a gold standard, and optionally write it to disk.

        :param model_type: type of model; must be one of
            constants.IMPLEMENTED_MODEL_TYPES
        :param model_path: path to ontoemma model (must exist)
        :param s_kb_path: path to source KB
        :param t_kb_path: path to target KB
        :param gold_path: path to gold alignment between source and target
            KBs; evaluation is skipped when None or when the path is missing
        :param output_path: path to write output alignment; nothing is
            written when None
        :param align_strat: strategy for alignment assignment
        :param cuda_device: GPU device number, forwarded to the 'nn' model
        :param missed_path: optional path for outputting missed alignments;
            defaults to output_path + '.ontoemma.missed' when output_path is
            given
        :return: whatever compare_alignment_to_gold returns when the gold
            file is evaluated, otherwise None
        """
        assert model_type in constants.IMPLEMENTED_MODEL_TYPES
        assert os.path.exists(model_path)
        assert s_kb_path is not None
        assert t_kb_path is not None

        sys.stdout.write("Loading KBs...\n")
        source_kb = self.load_kb(s_kb_path)
        target_kb = self.load_kb(t_kb_path)

        sys.stdout.write("Normalizing KBs...\n")
        for kb in (source_kb, target_kb):
            kb.normalize_kb()

        sys.stdout.write("Building candidate indices...\n")
        selector = CandidateSelection(source_kb, target_kb)

        # Score candidate pairs with the requested model; any other accepted
        # model type falls through with no similarity scores.
        if model_type == 'lr':
            sim_scores = self._align_lr(
                model_path, source_kb, target_kb, selector)
        elif model_type == 'nn':
            sim_scores = self._align_nn(
                model_path, source_kb, target_kb, selector, cuda_device)
        else:
            sim_scores = []

        nbr_scores = self._compute_neighborhood_similarities(
            sim_scores, source_kb, target_kb)
        alignment = self._compute_alignment(
            align_strat, sim_scores, nbr_scores, source_kb, target_kb)

        # Derive a default location for missed alignments next to the output.
        if output_path is not None and missed_path is None:
            missed_path = output_path + '.ontoemma.missed'

        if gold_path is not None and os.path.exists(gold_path):
            sys.stdout.write("Evaluating against gold standard...\n")
            eval_result = self.compare_alignment_to_gold(
                gold_path, alignment, source_kb, target_kb, missed_path)
        else:
            eval_result = None

        if output_path is not None:
            sys.stdout.write("Writing results to file...\n")
            self.write_alignment(output_path, alignment, s_kb_path, t_kb_path)

        return eval_result
Exemplo n.º 3
0
        def eval_cs(self, s_kb_path, t_kb_path, gold_path, output_path,
                    missed_path):
            """
            Evaluate the candidate selection module against a gold alignment.

            Loads both KBs and the gold alignment, builds a candidate
            selector over the two KBs, points its EVAL_OUTPUT_FILE and
            EVAL_MISSED_FILE attributes at the given paths, and calls its
            eval method with the gold (source, target) pairs.

            :param s_kb_path: source kb path
            :param t_kb_path: target kb path
            :param gold_path: gold alignment file path
            :param output_path: output path for evaluation results
            :param missed_path: output path for missed alignments
            :return: None
            """
            sys.stdout.write("Loading KBs...\n")
            source_kb = self.load_kb(s_kb_path)
            target_kb = self.load_kb(t_kb_path)

            sys.stdout.write("Loading gold alignment...\n")
            # Only the first two fields of each gold entry are kept.
            gold_pairs = [
                (entry[0], entry[1])
                for entry in self.load_alignment(gold_path)
            ]
            sys.stdout.write(
                "\tNumber of gold alignments: %i\n" % len(gold_pairs))

            sys.stdout.write("Starting candidate selection...\n")
            selector = CandidateSelection(source_kb, target_kb)
            selector.EVAL_OUTPUT_FILE = output_path
            selector.EVAL_MISSED_FILE = missed_path

            sys.stdout.write("Evaluating candidate selection...\n")
            selector.eval(gold_pairs)