Example #1
    def test_modh_match(self):
        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')

        ontoemma = OntoEmma()

        s_kb = ontoemma.load_kb(source_ont_file)
        s_kb.normalize_kb()

        t_kb = ontoemma.load_kb(target_ont_file)
        t_kb.normalize_kb()

        alignment = self.ontoemma._apply_modh_alignment_strategy(self.sim_scores, s_kb, t_kb)
        assert len(alignment) == 6
Example #2
    def query_all(self):
        """
        Query all entities in input data file
        :return:
        """
        try:
            ontoemma = OntoEmma()
            kb = ontoemma.load_kb(self.data_path)
            self.query_all_kb(kb)
        except Exception:
            try:
                self.query_all_training_data()
            except Exception:
                raise NotImplementedError(
                    "Unknown file type, cannot enrich...")
Example #3
class TestAssignmentStrategies(unittest.TestCase):

    sim_scores_fpath = os.path.join(TEST_DATA, 'test_sim_scores.pickle')
    sim_scores = pickle.load(open(sim_scores_fpath, 'rb'))
    ontoemma = OntoEmma()

    def test_best_match(self):
        alignment = self.ontoemma._apply_best_alignment_strategy(self.sim_scores)
        assert len(alignment) == 6

    def test_all_match(self):
        alignment = self.ontoemma._apply_all_alignment_strategy(self.sim_scores)
        assert len(alignment) == 6

    def test_modh_match(self):
        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')

        ontoemma = OntoEmma()

        s_kb = ontoemma.load_kb(source_ont_file)
        s_kb.normalize_kb()

        t_kb = ontoemma.load_kb(target_ont_file)
        t_kb.normalize_kb()

        alignment = self.ontoemma._apply_modh_alignment_strategy(self.sim_scores, s_kb, t_kb)
        assert len(alignment) == 6
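
The three tests above cover the alignment selection strategies that the run_ontoemma.py help text (Example #9) describes: best (best match per entity above a threshold), all (all matches above a threshold), and modh (a modified Hungarian assignment). As a rough illustration of the first strategy only, here is a minimal sketch; it is not OntoEmma's _apply_best_alignment_strategy, and it assumes sim_scores is a dict mapping (source_id, target_id) pairs to float scores, with an arbitrary threshold of 0.5:

def best_match_sketch(sim_scores, threshold=0.5):
    # For each source entity, keep only its highest-scoring target at or
    # above the threshold.
    best = {}  # source_id -> (target_id, score)
    for (s_id, t_id), score in sim_scores.items():
        if score < threshold:
            continue
        if s_id not in best or score > best[s_id][1]:
            best[s_id] = (t_id, score)
    return [(s_id, t_id, score) for s_id, (t_id, score) in best.items()]
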
Example #4
class TestNeighborhoodSimilarity(unittest.TestCase):

    source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
    target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')

    ontoemma = OntoEmma()

    s_kb = ontoemma.load_kb(source_ont_file)
    s_kb.normalize_kb()

    t_kb = ontoemma.load_kb(target_ont_file)
    t_kb.normalize_kb()

    sim_scores_fpath = os.path.join(TEST_DATA, 'test_sim_scores.pickle')
    sim_scores = pickle.load(open(sim_scores_fpath, 'rb'))

    def test_neighbor_similarity_null(self):
        neighborhood_sim_null = self.ontoemma._compute_neighborhood_similarities(
            self.sim_scores, self.s_kb, self.t_kb, 0)
        assert neighborhood_sim_null == self.sim_scores

    def test_neighborhood_similarity_oneiter(self):
        neighborhood_sim_one = self.ontoemma._compute_neighborhood_similarities(
            self.sim_scores, self.s_kb, self.t_kb, 1)
        assert len(neighborhood_sim_one) == len(self.sim_scores)
        assert neighborhood_sim_one != self.sim_scores

    def test_neighborhood_similarity_fiveiter(self):
        neighborhood_sim_five = self.ontoemma._compute_neighborhood_similarities(
            self.sim_scores, self.s_kb, self.t_kb, 5)
        assert len(neighborhood_sim_five) == len(self.sim_scores)
        assert neighborhood_sim_five != self.sim_scores
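
These tests only assert that neighborhood propagation preserves the number of score pairs and that one or more iterations change the values. Purely as an illustration of the general idea (not OntoEmma's _compute_neighborhood_similarities), one common formulation blends each pair's score with the mean score of neighboring pairs; s_neighbors and t_neighbors are hypothetical callables returning an entity's neighbor ids, sim_scores is assumed to be a dict keyed by (source_id, target_id), and alpha is an assumed blending weight:

def propagate_sketch(sim_scores, s_neighbors, t_neighbors, n_iter, alpha=0.5):
    # Blend each pair's score with the mean score of its neighboring pairs;
    # n_iter == 0 returns an unchanged copy, consistent with the "null" test.
    scores = dict(sim_scores)
    for _ in range(n_iter):
        updated = {}
        for (s_id, t_id), score in scores.items():
            neighbor_scores = [scores[(sn, tn)]
                               for sn in s_neighbors(s_id)
                               for tn in t_neighbors(t_id)
                               if (sn, tn) in scores]
            if neighbor_scores:
                score = (1 - alpha) * score + \
                    alpha * sum(neighbor_scores) / len(neighbor_scores)
            updated[(s_id, t_id)] = score
        scores = updated
    return scores
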
Example #5
    def test_lr(self):
        model_path = os.path.join(TEST_DATA, 'test_lr_model.pickle')
        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
        input_alignment_file = os.path.join(TEST_DATA,
                                            'test_input_alignment.tsv')
        output_alignment_file = os.path.join(TEST_DATA,
                                             'test_output_alignment.tsv')

        matcher = OntoEmma()
        p, r, f1 = matcher.align('lr', model_path, source_ont_file,
                                 target_ont_file, input_alignment_file,
                                 output_alignment_file, -1)

        assert p >= 0.8
        assert r >= 0.6
        assert f1 >= 0.7
Example #6
    def test_nn(self):
        config_file = os.path.join(TEST_DATA, 'test_nn_config_file.json')
        model_path = os.path.join(TEST_DATA, 'test_nn_model')

        if os.path.exists(model_path):
            shutil.rmtree(model_path)

        matcher = OntoEmma()
        matcher.train('nn', model_path, config_file)

        assert os.path.exists(os.path.join(model_path, 'model.tar.gz'))

        model_path = os.path.join(TEST_DATA, 'test_nn_model', 'model.tar.gz')
        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
        input_alignment_file = os.path.join(TEST_DATA,
                                            'test_input_alignment.tsv')
        output_alignment_file = os.path.join(TEST_DATA,
                                             'test_output_alignment.tsv')

        matcher = OntoEmma()
        p, r, f1 = matcher.align('nn', model_path, source_ont_file,
                                 target_ont_file, input_alignment_file,
                                 output_alignment_file, -1)
        assert p >= 0.0
        assert r >= 0.0
        assert f1 >= 0.0
Example #7
    def test_lr(self):
        config_file = os.path.join(TEST_DATA, 'test_lr_config_file.json')
        model_path = os.path.join(TEST_DATA, 'test_lr_model.pickle')

        if os.path.exists(model_path):
            os.remove(model_path)

        matcher = OntoEmma()
        matcher.train('lr', model_path, config_file)

        assert os.path.exists(model_path)

        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
        input_alignment_file = os.path.join(TEST_DATA,
                                            'test_input_alignment.tsv')
        output_alignment_file = os.path.join(TEST_DATA,
                                             'test_output_alignment.tsv')

        matcher = OntoEmma()
        p, r, f1 = matcher.align('lr', model_path, source_ont_file,
                                 target_ont_file, input_alignment_file,
                                 output_alignment_file, "best", -1)

        assert p >= 0.8
        assert r >= 0.6
        assert f1 >= 0.7
Example #8
    def split_training_data(self):
        """
        Process and split data into training, development, and test sets
        :return:
        """
        all_kb_names = constants.TRAINING_KBS + constants.DEVELOPMENT_KBS
        training_file_dir = os.path.join(self.OUTPUT_DIR, 'training')

        output_training_data = os.path.join(self.TRAINING_DIR,
                                            'ontoemma.context.train')
        output_development_data = os.path.join(self.TRAINING_DIR,
                                               'ontoemma.context.dev')
        output_test_data = os.path.join(self.TRAINING_DIR,
                                        'ontoemma.context.test')

        context_files = glob.glob(
            os.path.join(self.OUTPUT_KB_DIR, '*context.json'))
        context_kbs = [
            os.path.basename(f).split('-')[1] for f in context_files
        ]
        training_files = glob.glob(os.path.join(training_file_dir, '*.tsv'))
        file_names = [
            os.path.splitext(os.path.basename(f))[0] for f in training_files
        ]

        training_labels = []
        training_dat = []

        emma = OntoEmma()

        for fname, fpath in zip(file_names, training_files):
            (kb1_name, kb2_name) = fname.split('-')
            if kb1_name in all_kb_names and kb2_name in all_kb_names \
                    and kb1_name in context_kbs and kb2_name in context_kbs:
                sys.stdout.write("Processing %s and %s\n" %
                                 (kb1_name, kb2_name))
                kb1 = emma.load_kb(
                    os.path.join(self.OUTPUT_KB_DIR,
                                 'kb-{}-context.json'.format(kb1_name)))
                kb2 = emma.load_kb(
                    os.path.join(self.OUTPUT_KB_DIR,
                                 'kb-{}-context.json'.format(kb2_name)))
                alignment = emma.load_alignment(fpath)

                for (e1, e2, score) in alignment:
                    kb1_ent = kb1.get_entity_by_research_entity_id(e1)
                    kb2_ent = kb2.get_entity_by_research_entity_id(e2)
                    training_labels.append(int(score))
                    training_dat.append({
                        "source_entity":
                        self._kb_entity_to_training_json(kb1_ent, kb1),
                        "target_entity":
                        self._kb_entity_to_training_json(kb2_ent, kb2)
                    })
            else:
                sys.stdout.write("Skipping %s and %s\n" % (kb1_name, kb2_name))

        training_dat, test_dat, training_labels, test_labels = train_test_split(
            training_dat,
            training_labels,
            stratify=training_labels,
            test_size=constants.TEST_PART)

        training_dat, development_dat, training_labels, development_labels = train_test_split(
            training_dat,
            training_labels,
            stratify=training_labels,
            test_size=constants.DEVELOPMENT_PART)

        training_labels = self._replace_negative_labels(training_labels)
        development_labels = self._replace_negative_labels(development_labels)
        test_labels = self._replace_negative_labels(test_labels)

        with jsonlines.open(output_training_data, mode='w') as writer:
            for label, dat in zip(training_labels, training_dat):
                writer.write({
                    "label": label,
                    "source_ent": dat["source_entity"],
                    "target_ent": dat["target_entity"]
                })

        with jsonlines.open(output_development_data, mode='w') as writer:
            for label, dat in zip(development_labels, development_dat):
                writer.write({
                    "label": label,
                    "source_ent": dat["source_entity"],
                    "target_ent": dat["target_entity"]
                })

        with jsonlines.open(output_test_data, mode='w') as writer:
            for label, dat in zip(test_labels, test_dat):
                writer.write({
                    "label": label,
                    "source_ent": dat["source_entity"],
                    "target_ent": dat["target_entity"]
                })
        return
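
Each line of the three output files written above is a JSON object carrying a label and the two serialized entities. A minimal sketch of reading one of them back (the file name comes from the paths above; the entity payloads produced by _kb_entity_to_training_json are treated as opaque dicts here):

import jsonlines

with jsonlines.open('ontoemma.context.train') as reader:
    for record in reader:
        label = record['label']            # label after _replace_negative_labels
        source_ent = record['source_ent']  # serialized source KB entity
        target_ent = record['target_ent']  # serialized target KB entity
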
Example #9
def main(argv):
    model_path = None
    model_type = "nn"
    source_ont_file = None
    target_ont_file = None
    input_alignment_file = None
    output_alignment_file = None
    align_strat = "best"
    cuda_device = -1

    sys.stdout.write('\n')
    sys.stdout.write('-------------------------\n')
    sys.stdout.write('OntoEMMA version 0.1     \n')
    sys.stdout.write('-------------------------\n')
    sys.stdout.write('https://github.com/allenai/ontoemma\n')
    sys.stdout.write('An ML-based ontology matcher to produce entity alignments between knowledgebases\n')
    sys.stdout.write('\n')

    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context
        nltk.download("stopwords")

    try:
        # TODO(waleeda): use argparse instead of getopt to parse command line arguments.
        opts, args = getopt.getopt(
            argv, "hs:t:i:o:m:p:g:a:", ["source=", "target=", "input=", "output=", "model_path=", "model_type=", "cuda_device=", "alignment_strategy="]
        )
    except getopt.GetoptError:
        sys.stdout.write('Unknown option... -h or --help for help.\n')
        sys.exit(1)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            sys.stdout.write('Options: \n')
            sys.stdout.write('-s <source_ontology_file>\n')
            sys.stdout.write('-t <target_ontology_file>\n')
            sys.stdout.write('-i <input_alignment_file>\n')
            sys.stdout.write('-o <output_alignment_file>\n')
            sys.stdout.write('-m <model_location>\n')
            sys.stdout.write('-p <model_type>\n')
            sys.stdout.write('-g <cuda_device>\n')
            sys.stdout.write('-a <alignment_strategy>\n')
            sys.stdout.write('Example usage: \n')
            sys.stdout.write(
                '  ./run_ontoemma.py -s source_ont.json -t target_ont.json -i gold_alignment.tsv -o generated_alignment.tsv -m model_serialization_dir -p nn\n'
            )
            sys.stdout.write('-------------------------\n')
            sys.stdout.write('Accepted KB file formats: json, pickle, owl\n')
            sys.stdout.write('Accepted alignment file formats: rdf, tsv\n')
            sys.stdout.write('Accepted model types (defaults to nn):\n')
            sys.stdout.write('\tnn (neural network)\n')
            sys.stdout.write('\tlr (logistic regression)\n')
            sys.stdout.write('Accepted alignment strategies (defaults to best):\n')
            sys.stdout.write('\tbest (best match per entity above threshold)\n')
            sys.stdout.write('\tall (all matches per entity above threshold)\n')
            sys.stdout.write('\tmodh (modified hungarian algorithm for assignment)\n')
            sys.stdout.write('Pretrained models can be found at:\n')
            sys.stdout.write('  /net/nfs.corp/s2-research/scigraph/ontoemma/\n')
            sys.stdout.write('-------------------------\n')
            sys.stdout.write('\n')
            sys.exit(0)
        elif opt in ("-s", "--source"):
            source_ont_file = os.path.abspath(arg)
            sys.stdout.write('Source ontology file is %s\n' % source_ont_file)
        elif opt in ("-t", "--target"):
            target_ont_file = os.path.abspath(arg)
            sys.stdout.write('Target ontology file is %s\n' % target_ont_file)
        elif opt in ("-i", "--input"):
            input_alignment_file = os.path.abspath(arg)
            sys.stdout.write(
                'Input alignment file is %s\n' % input_alignment_file
            )
        elif opt in ("-o", "--output"):
            output_alignment_file = os.path.abspath(arg)
            sys.stdout.write(
                'Output alignment file is %s\n' % output_alignment_file
            )
        elif opt in ("-m", "--model"):
            model_path = os.path.abspath(arg)
        elif opt in ("-p", "--model-type"):
            if arg in emma.constants.IMPLEMENTED_MODEL_TYPES:
                model_type = arg
                sys.stdout.write(
                    'Model type is %s\n' % emma.constants.IMPLEMENTED_MODEL_TYPES[model_type]
                )
            else:
                sys.stdout.write('Error: Unknown model type...\n')
                sys.exit(1)
        elif opt in ("-a", "--alignment_method"):
            if arg in emma.constants.IMPLEMENTED_ALIGNMENT_STRATEGY:
                align_strat = arg
                sys.stdout.write(
                    'Alignment selection strategy is %s\n' % arg.upper()
                )
            else:
                sys.stdout.write('Error: Unknown alignment selection strategy\n')
        elif opt in ("-g", "--cuda_device"):
            cuda_device = int(arg)
            sys.stdout.write(
                'Using CUDA device %i\n' % cuda_device
            )

    sys.stdout.write('\n')

    if source_ont_file is not None and target_ont_file is not None:
        matcher = OntoEmma()
        matcher.align(
            model_type, model_path,
            source_ont_file, target_ont_file,
            input_alignment_file, output_alignment_file,
            align_strat, cuda_device
        )
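
The excerpt ends at the align() call. The script would presumably be wired up with a standard entry-point guard, and its own -h output gives the intended invocation:

# Assumed entry point; not shown in the excerpt above.
if __name__ == '__main__':
    main(sys.argv[1:])

# Example invocation, as printed by the -h help text:
#   ./run_ontoemma.py -s source_ont.json -t target_ont.json \
#       -i gold_alignment.tsv -o generated_alignment.tsv \
#       -m model_serialization_dir -p nn
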
Example #10
def main(argv):
    model_path = None
    model_type = "nn"
    config_file = None
    evaluate_flag = False
    evaluation_data_file = None
    cuda_device = -1

    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context
        nltk.download("stopwords")
        nltk.download("wordnet")

    try:
        # TODO(waleeda): use argparse instead of getopt to parse command line arguments.
        opts, args = getopt.getopt(argv, "hec:m:p:d:g:", [
            "config=", "model_path=", "model_type=", "evaluation_data_file=",
            "cuda_device="
        ])
    except getopt.GetoptError:
        sys.stdout.write('Unknown option... -h or --help for help.\n')
        sys.exit(1)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            sys.stdout.write('Options: \n')
            sys.stdout.write('-c <configuration_file>\n')
            sys.stdout.write('-m <model_location>\n')
            sys.stdout.write('-p <model_type>\n')
            sys.stdout.write('-e # evaluation mode\n')
            sys.stdout.write('-d <evaluation_data_file>\n')
            sys.stdout.write('-g <cuda_device>\n\n')
            sys.stdout.write('Example usages: \n')
            sys.stdout.write(
                '  ./train_ontoemma.py -c configuration_file.json -m model_file_path -p nn\n'
            )
            sys.stdout.write(
                '  ./train_ontoemma.py -e -m model_file_path -d evaluation_data_path -g 5\n'
            )
            sys.stdout.write('-------------------------\n')
            sys.stdout.write(
                'Accepted model types: nn (neural network), lr (logistic regression)\n'
            )
            sys.stdout.write('-------------------------\n')
            sys.stdout.write('\n')
            sys.exit(0)
        elif opt in ("-e", "--evaluate"):
            evaluate_flag = True
            sys.stdout.write('Evaluation mode\n')
        elif opt in ("-c", "--config"):
            config_file = os.path.abspath(arg)
            sys.stdout.write('Configuration file is %s\n' % config_file)
        elif opt in ("-m", "--model"):
            model_path = os.path.abspath(arg)
            sys.stdout.write('Model output path is %s\n' % model_path)
        elif opt in ("-p", "--model-type"):
            if arg in emma.constants.IMPLEMENTED_MODEL_TYPES:
                model_type = arg
                sys.stdout.write(
                    'Model type is %s\n' %
                    emma.constants.IMPLEMENTED_MODEL_TYPES[model_type])
            else:
                sys.stdout.write('Error: Unknown model type...\n')
                sys.exit(1)
        elif opt in ("-d", "--eval-data-file"):
            evaluation_data_file = os.path.abspath(arg)
            sys.stdout.write('Evaluation data file is %s\n' %
                             evaluation_data_file)
        elif opt in ("-g", "--cuda-device"):
            cuda_device = int(arg)
            sys.stdout.write('Using CUDA device %i\n' % cuda_device)

    sys.stdout.write('\n')

    matcher = OntoEmma()
    if evaluate_flag:
        matcher.evaluate(model_type, model_path, evaluation_data_file,
                         cuda_device)
    else:
        matcher.train(model_type, model_path, config_file)
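
For reference, a minimal programmatic equivalent of the two CLI modes above, assuming the emma.OntoEmma module layout suggested by the emma.constants references and using placeholder paths:

# Sketch of driving training/evaluation directly rather than via the CLI.
# The import path is an assumption about the package layout, and the file
# paths are placeholders; the method signatures mirror the calls in main().
from emma.OntoEmma import OntoEmma  # assumed module layout

matcher = OntoEmma()
# Training mode: ./train_ontoemma.py -c configuration_file.json -m <model_path> -p nn
matcher.train('nn', 'model_output_dir', 'configuration_file.json')
# Evaluation mode: ./train_ontoemma.py -e -m <model_path> -d <evaluation_data_file> -g -1
matcher.evaluate('nn', 'model_output_dir', 'evaluation_data_file', -1)
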