Example #1
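This test exercises the neural-network ('nn') matcher end to end: it trains a model from a JSON config file into a fresh serialization directory, checks that model.tar.gz was written, and then aligns the test source and target ontologies, asserting that precision, recall, and F1 are non-negative (TEST_DATA points at the test fixture directory; the trailing -1 is the cuda_device argument).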
    def test_nn(self):
        config_file = os.path.join(TEST_DATA, 'test_nn_config_file.json')
        model_path = os.path.join(TEST_DATA, 'test_nn_model')

        if os.path.exists(model_path):
            shutil.rmtree(model_path)

        matcher = OntoEmma()
        matcher.train('nn', model_path, config_file)

        assert (os.path.exists(os.path.join(model_path, 'model.tar.gz')))

        model_path = os.path.join(TEST_DATA, 'test_nn_model', 'model.tar.gz')
        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
        input_alignment_file = os.path.join(TEST_DATA,
                                            'test_input_alignment.tsv')
        output_alignment_file = os.path.join(TEST_DATA,
                                             'test_output_alignment.tsv')

        matcher = OntoEmma()
        p, r, f1 = matcher.align('nn', model_path, source_ont_file,
                                 target_ont_file, input_alignment_file,
                                 output_alignment_file, -1)
        assert p >= 0.0
        assert r >= 0.0
        assert f1 >= 0.0
Example #2
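The same flow for the logistic-regression ('lr') matcher: the model is trained into a pickle file, its existence is asserted, and alignment is run with the 'best' selection strategy; the assertions require minimum precision, recall, and F1 of 0.8, 0.6, and 0.7.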
    def test_lr(self):
        config_file = os.path.join(TEST_DATA, 'test_lr_config_file.json')
        model_path = os.path.join(TEST_DATA, 'test_lr_model.pickle')

        if os.path.exists(model_path):
            os.remove(model_path)

        matcher = OntoEmma()
        matcher.train('lr', model_path, config_file)

        assert (os.path.exists(model_path))

        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
        input_alignment_file = os.path.join(TEST_DATA,
                                            'test_input_alignment.tsv')
        output_alignment_file = os.path.join(TEST_DATA,
                                             'test_output_alignment.tsv')

        matcher = OntoEmma()
        p, r, f1 = matcher.align('lr', model_path, source_ont_file,
                                 target_ont_file, input_alignment_file,
                                 output_alignment_file, "best", -1)

        assert p >= 0.8
        assert r >= 0.6
        assert f1 >= 0.7
Example #3
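A variant of the logistic-regression test that skips training and reuses an existing pickled model, calling align without an explicit alignment-strategy argument.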
    def test_lr(self):
        model_path = os.path.join(TEST_DATA, 'test_lr_model.pickle')
        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
        input_alignment_file = os.path.join(TEST_DATA,
                                            'test_input_alignment.tsv')
        output_alignment_file = os.path.join(TEST_DATA,
                                             'test_output_alignment.tsv')

        matcher = OntoEmma()
        p, r, f1 = matcher.align('lr', model_path, source_ont_file,
                                 target_ont_file, input_alignment_file,
                                 output_alignment_file, -1)

        assert p >= 0.8
        assert r >= 0.6
        assert f1 >= 0.7
Example #4
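The command-line entry point of run_ontoemma.py: it prints a banner, downloads the NLTK stopwords corpus if missing, parses the command-line flags with getopt, and calls OntoEmma.align once both a source and a target ontology file have been supplied.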
def main(argv):
    model_path = None
    model_type = "nn"
    source_ont_file = None
    target_ont_file = None
    input_alignment_file = None
    output_alignment_file = None
    align_strat = "best"
    cuda_device = -1

    sys.stdout.write('\n')
    sys.stdout.write('-------------------------\n')
    sys.stdout.write('OntoEMMA version 0.1     \n')
    sys.stdout.write('-------------------------\n')
    sys.stdout.write('https://github.com/allenai/ontoemma\n')
    sys.stdout.write('An ML-based ontology matcher to produce entity alignments between knowledgebases\n')
    sys.stdout.write('\n')

    # Download the NLTK stopwords corpus on first use.
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        # Fall back to an unverified SSL context so that nltk.download()
        # can fetch the corpus on systems where certificate verification fails.
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context
        nltk.download("stopwords")

    try:
        # TODO(waleeda): use argparse instead of getopt to parse command line arguments.
        opts, args = getopt.getopt(
            argv, "hs:t:i:o:m:p:g:a:", ["source=", "target=", "input=", "output=", "model_path=", "model_type=", "cuda_device=", "alignment_strategy="]
        )
    except getopt.GetoptError:
        sys.stdout.write('Unknown option... -h or --help for help.\n')
        sys.exit(1)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            sys.stdout.write('Options: \n')
            sys.stdout.write('-s <source_ontology_file>\n')
            sys.stdout.write('-t <target_ontology_file>\n')
            sys.stdout.write('-i <input_alignment_file>\n')
            sys.stdout.write('-o <output_alignment_file>\n')
            sys.stdout.write('-m <model_location>\n')
            sys.stdout.write('-p <model_type>\n')
            sys.stdout.write('-g <cuda_device>\n')
            sys.stdout.write('-a <alignment_strategy>\n')
            sys.stdout.write('Example usage: \n')
            sys.stdout.write(
                '  ./run_ontoemma.py -s source_ont.json -t target_ont.json -i gold_alignment.tsv -o generated_alignment.tsv -m model_serialization_dir -p nn\n'
            )
            sys.stdout.write('-------------------------\n')
            sys.stdout.write('Accepted KB file formats: json, pickle, owl\n')
            sys.stdout.write('Accepted alignment file formats: rdf, tsv\n')
            sys.stdout.write('Accepted model types (defaults to nn):\n')
            sys.stdout.write('\tnn (neural network)\n')
            sys.stdout.write('\tlr (logistic regression)\n')
            sys.stdout.write('Accepted alignment strategies (defaults to best):\n')
            sys.stdout.write('\tbest (best match per entity above threshold)\n')
            sys.stdout.write('\tall (all matches per entity above threshold)\n')
            sys.stdout.write('\tmodh (modified hungarian algorithm for assignment)\n')
            sys.stdout.write('Pretrained models can be found at:\n')
            sys.stdout.write('  /net/nfs.corp/s2-research/scigraph/ontoemma/\n')
            sys.stdout.write('-------------------------\n')
            sys.stdout.write('\n')
            sys.exit(0)
        elif opt in ("-s", "--source"):
            source_ont_file = os.path.abspath(arg)
            sys.stdout.write('Source ontology file is %s\n' % source_ont_file)
        elif opt in ("-t", "--target"):
            target_ont_file = os.path.abspath(arg)
            sys.stdout.write('Target ontology file is %s\n' % target_ont_file)
        elif opt in ("-i", "--input"):
            input_alignment_file = os.path.abspath(arg)
            sys.stdout.write(
                'Input alignment file is %s\n' % input_alignment_file
            )
        elif opt in ("-o", "--output"):
            output_alignment_file = os.path.abspath(arg)
            sys.stdout.write(
                'Output alignment file is %s\n' % output_alignment_file
            )
        elif opt in ("-m", "--model"):
            model_path = os.path.abspath(arg)
        elif opt in ("-p", "--model-type"):
            if arg in emma.constants.IMPLEMENTED_MODEL_TYPES:
                model_type = arg
                sys.stdout.write(
                    'Model type is %s\n' % emma.constants.IMPLEMENTED_MODEL_TYPES[model_type]
                )
            else:
                sys.stdout.write('Error: Unknown model type...\n')
                sys.exit(1)
        elif opt in ("-a", "--alignment_method"):
            if arg in emma.constants.IMPLEMENTED_ALIGNMENT_STRATEGY:
                align_strat = arg
                sys.stdout.write(
                    'Alignment selection strategy is %s\n' % arg.upper()
                )
            else:
                sys.stdout.write('Error: Unknown alignment selection strategy\n')
        elif opt in ("-g", "--cuda_device"):
            cuda_device = int(arg)
            sys.stdout.write(
                'Using CUDA device %i\n' % cuda_device
            )

    sys.stdout.write('\n')

    if source_ont_file is not None and target_ont_file is not None:
        matcher = OntoEmma()
        matcher.align(
            model_type, model_path,
            source_ont_file, target_ont_file,
            input_alignment_file, output_alignment_file,
            align_strat, cuda_device
        )
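
The TODO above already flags getopt for replacement with argparse. Below is a minimal, hypothetical sketch of such a parser, mirroring the flags, defaults, and accepted values shown in the help text; the function name build_arg_parser and the use of argparse are illustrative additions, not part of OntoEmma.

import argparse

def build_arg_parser():
    # Hypothetical argparse equivalent of the getopt loop above.
    parser = argparse.ArgumentParser(
        description="OntoEMMA: an ML-based ontology matcher that produces "
                    "entity alignments between knowledgebases"
    )
    parser.add_argument("-s", "--source", help="source ontology file (json, pickle, owl)")
    parser.add_argument("-t", "--target", help="target ontology file (json, pickle, owl)")
    parser.add_argument("-i", "--input", help="input (gold) alignment file (rdf, tsv)")
    parser.add_argument("-o", "--output", help="output alignment file (rdf, tsv)")
    parser.add_argument("-m", "--model_path", help="model serialization directory or pickle file")
    parser.add_argument("-p", "--model_type", default="nn", choices=["nn", "lr"],
                        help="model type (neural network or logistic regression)")
    parser.add_argument("-a", "--alignment_strategy", default="best", choices=["best", "all", "modh"],
                        help="alignment selection strategy")
    parser.add_argument("-g", "--cuda_device", type=int, default=-1,
                        help="CUDA device id")
    return parser

# Usage sketch:
#   args = build_arg_parser().parse_args()
#   OntoEmma().align(args.model_type, args.model_path, args.source, args.target,
#                    args.input, args.output, args.alignment_strategy, args.cuda_device)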