def query_all(self):
    """
    Query all entities in input data file
    :return:
    """
    try:
        # first try to interpret the input file as a KB
        ontoemma = OntoEmma()
        kb = ontoemma.load_kb(self.data_path)
        self.query_all_kb(kb)
    except Exception:
        # fall back to treating the input as training data
        try:
            self.query_all_training_data()
        except Exception:
            raise NotImplementedError("Unknown file type, cannot enrich...")
class TestAssignmentStrategies(unittest.TestCase):
    sim_scores_fpath = os.path.join(TEST_DATA, 'test_sim_scores.pickle')
    sim_scores = pickle.load(open(sim_scores_fpath, 'rb'))
    ontoemma = OntoEmma()

    def test_best_match(self):
        alignment = self.ontoemma._apply_best_alignment_strategy(self.sim_scores)
        assert len(alignment) == 6

    def test_all_match(self):
        alignment = self.ontoemma._apply_all_alignment_strategy(self.sim_scores)
        assert len(alignment) == 6

    def test_modh_match(self):
        source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
        target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
        ontoemma = OntoEmma()
        s_kb = ontoemma.load_kb(source_ont_file)
        s_kb.normalize_kb()
        t_kb = ontoemma.load_kb(target_ont_file)
        t_kb.normalize_kb()
        alignment = self.ontoemma._apply_modh_alignment_strategy(
            self.sim_scores, s_kb, t_kb)
        assert len(alignment) == 6
class TestNeighborhoodSimilarity(unittest.TestCase):
    source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
    target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
    ontoemma = OntoEmma()
    s_kb = ontoemma.load_kb(source_ont_file)
    s_kb.normalize_kb()
    t_kb = ontoemma.load_kb(target_ont_file)
    t_kb.normalize_kb()
    sim_scores_fpath = os.path.join(TEST_DATA, 'test_sim_scores.pickle')
    sim_scores = pickle.load(open(sim_scores_fpath, 'rb'))

    def test_neighbor_similarity_null(self):
        neighborhood_sim_null = self.ontoemma._compute_neighborhood_similarities(
            self.sim_scores, self.s_kb, self.t_kb, 0)
        assert neighborhood_sim_null == self.sim_scores

    def test_neighborhood_similarity_oneiter(self):
        neighborhood_sim_one = self.ontoemma._compute_neighborhood_similarities(
            self.sim_scores, self.s_kb, self.t_kb, 1)
        assert len(neighborhood_sim_one) == len(self.sim_scores)
        assert neighborhood_sim_one != self.sim_scores

    def test_neighborhood_similarity_fiveiter(self):
        neighborhood_sim_five = self.ontoemma._compute_neighborhood_similarities(
            self.sim_scores, self.s_kb, self.t_kb, 5)
        assert len(neighborhood_sim_five) == len(self.sim_scores)
        assert neighborhood_sim_five != self.sim_scores
def test_lr(self):
    model_path = os.path.join(TEST_DATA, 'test_lr_model.pickle')
    source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
    target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
    input_alignment_file = os.path.join(TEST_DATA, 'test_input_alignment.tsv')
    output_alignment_file = os.path.join(TEST_DATA, 'test_output_alignment.tsv')

    matcher = OntoEmma()
    p, r, f1 = matcher.align(
        'lr', model_path,
        source_ont_file, target_ont_file,
        input_alignment_file, output_alignment_file,
        -1)
    assert p >= 0.8
    assert r >= 0.6
    assert f1 >= 0.7
def test_nn(self):
    config_file = os.path.join(TEST_DATA, 'test_nn_config_file.json')
    model_path = os.path.join(TEST_DATA, 'test_nn_model')

    if os.path.exists(model_path):
        shutil.rmtree(model_path)

    matcher = OntoEmma()
    matcher.train('nn', model_path, config_file)
    assert os.path.exists(os.path.join(model_path, 'model.tar.gz'))

    model_path = os.path.join(TEST_DATA, 'test_nn_model', 'model.tar.gz')
    source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
    target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
    input_alignment_file = os.path.join(TEST_DATA, 'test_input_alignment.tsv')
    output_alignment_file = os.path.join(TEST_DATA, 'test_output_alignment.tsv')

    matcher = OntoEmma()
    p, r, f1 = matcher.align(
        'nn', model_path,
        source_ont_file, target_ont_file,
        input_alignment_file, output_alignment_file,
        -1)
    assert p >= 0.0
    assert r >= 0.0
    assert f1 >= 0.0
def test_lr(self):
    config_file = os.path.join(TEST_DATA, 'test_lr_config_file.json')
    model_path = os.path.join(TEST_DATA, 'test_lr_model.pickle')

    if os.path.exists(model_path):
        os.remove(model_path)

    matcher = OntoEmma()
    matcher.train('lr', model_path, config_file)
    assert os.path.exists(model_path)

    source_ont_file = os.path.join(TEST_DATA, 'test_source_ont.json')
    target_ont_file = os.path.join(TEST_DATA, 'test_target_ont.json')
    input_alignment_file = os.path.join(TEST_DATA, 'test_input_alignment.tsv')
    output_alignment_file = os.path.join(TEST_DATA, 'test_output_alignment.tsv')

    matcher = OntoEmma()
    p, r, f1 = matcher.align(
        'lr', model_path,
        source_ont_file, target_ont_file,
        input_alignment_file, output_alignment_file,
        "best", -1)
    assert p >= 0.8
    assert r >= 0.6
    assert f1 >= 0.7
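# The test cases above can be run with the standard unittest runner, e.g.
# (assuming the usual tests/ package layout; adjust the path to match the
# repository):
#
#     python -m unittest discover tests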
def split_training_data(self):
    """
    Process and split data into training, development, and test sets
    :return:
    """
    all_kb_names = constants.TRAINING_KBS + constants.DEVELOPMENT_KBS
    training_file_dir = os.path.join(self.OUTPUT_DIR, 'training')
    output_training_data = os.path.join(self.TRAINING_DIR, 'ontoemma.context.train')
    output_development_data = os.path.join(self.TRAINING_DIR, 'ontoemma.context.dev')
    output_test_data = os.path.join(self.TRAINING_DIR, 'ontoemma.context.test')

    context_files = glob.glob(
        os.path.join(self.OUTPUT_KB_DIR, '*context.json'))
    context_kbs = [
        os.path.basename(f).split('-')[1] for f in context_files
    ]

    training_files = glob.glob(os.path.join(training_file_dir, '*.tsv'))
    file_names = [
        os.path.splitext(os.path.basename(f))[0] for f in training_files
    ]

    training_labels = []
    training_dat = []

    emma = OntoEmma()

    # build labeled entity pairs from each KB pair that has both an
    # alignment file and context-enriched KBs
    for fname, fpath in zip(file_names, training_files):
        (kb1_name, kb2_name) = fname.split('-')
        if kb1_name in all_kb_names and kb2_name in all_kb_names \
                and kb1_name in context_kbs and kb2_name in context_kbs:
            sys.stdout.write("Processing %s and %s\n" % (kb1_name, kb2_name))
            kb1 = emma.load_kb(
                os.path.join(self.OUTPUT_KB_DIR,
                             'kb-{}-context.json'.format(kb1_name)))
            kb2 = emma.load_kb(
                os.path.join(self.OUTPUT_KB_DIR,
                             'kb-{}-context.json'.format(kb2_name)))
            alignment = emma.load_alignment(fpath)
            for (e1, e2, score) in alignment:
                kb1_ent = kb1.get_entity_by_research_entity_id(e1)
                kb2_ent = kb2.get_entity_by_research_entity_id(e2)
                training_labels.append(int(score))
                training_dat.append({
                    "source_entity": self._kb_entity_to_training_json(kb1_ent, kb1),
                    "target_entity": self._kb_entity_to_training_json(kb2_ent, kb2)
                })
        else:
            sys.stdout.write("Skipping %s and %s\n" % (kb1_name, kb2_name))

    # hold out a test set, then split the remainder into training and development
    training_dat, test_dat, training_labels, test_labels = train_test_split(
        training_dat, training_labels,
        stratify=training_labels,
        test_size=constants.TEST_PART)

    training_dat, development_dat, training_labels, development_labels = train_test_split(
        training_dat, training_labels,
        stratify=training_labels,
        test_size=constants.DEVELOPMENT_PART)

    training_labels = self._replace_negative_labels(training_labels)
    development_labels = self._replace_negative_labels(development_labels)
    test_labels = self._replace_negative_labels(test_labels)

    with jsonlines.open(output_training_data, mode='w') as writer:
        for label, dat in zip(training_labels, training_dat):
            writer.write({
                "label": label,
                "source_ent": dat["source_entity"],
                "target_ent": dat["target_entity"]
            })

    with jsonlines.open(output_development_data, mode='w') as writer:
        for label, dat in zip(development_labels, development_dat):
            writer.write({
                "label": label,
                "source_ent": dat["source_entity"],
                "target_ent": dat["target_entity"]
            })

    with jsonlines.open(output_test_data, mode='w') as writer:
        for label, dat in zip(test_labels, test_dat):
            writer.write({
                "label": label,
                "source_ent": dat["source_entity"],
                "target_ent": dat["target_entity"]
            })

    return
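# Illustrative sketch (not part of the original module): reading back one of
# the generated jsonlines training files. Only the top-level keys written
# above are assumed; the "source_ent"/"target_ent" payloads depend on
# _kb_entity_to_training_json.
#
#     with jsonlines.open(output_training_data) as reader:
#         for record in reader:
#             label = record["label"]        # binary match label
#             source = record["source_ent"]  # source entity JSON
#             target = record["target_ent"]  # target entity JSON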
def main(argv):
    model_path = None
    model_type = "nn"
    source_ont_file = None
    target_ont_file = None
    input_alignment_file = None
    output_alignment_file = None
    align_strat = "best"
    cuda_device = -1

    sys.stdout.write('\n')
    sys.stdout.write('-------------------------\n')
    sys.stdout.write('OntoEMMA version 0.1 \n')
    sys.stdout.write('-------------------------\n')
    sys.stdout.write('https://github.com/allenai/ontoemma\n')
    sys.stdout.write('An ML-based ontology matcher to produce entity alignments between knowledgebases\n')
    sys.stdout.write('\n')

    # make sure NLTK stopwords are available, working around SSL certificate
    # issues when downloading
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context
        nltk.download("stopwords")

    try:
        # TODO(waleeda): use argparse instead of getopt to parse command line arguments.
        opts, args = getopt.getopt(
            argv, "hs:t:i:o:m:p:g:a:",
            ["source=", "target=", "input=", "output=", "model_path=",
             "model_type=", "cuda_device=", "alignment_strategy="]
        )
    except getopt.GetoptError:
        sys.stdout.write('Unknown option... -h or --help for help.\n')
        sys.exit(1)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            sys.stdout.write('Options: \n')
            sys.stdout.write('-s <source_ontology_file>\n')
            sys.stdout.write('-t <target_ontology_file>\n')
            sys.stdout.write('-i <input_alignment_file>\n')
            sys.stdout.write('-o <output_alignment_file>\n')
            sys.stdout.write('-m <model_location>\n')
            sys.stdout.write('-p <model_type>\n')
            sys.stdout.write('-g <cuda_device>\n')
            sys.stdout.write('-a <alignment_strategy>\n')
            sys.stdout.write('Example usage: \n')
            sys.stdout.write(
                ' ./run_ontoemma.py -s source_ont.json -t target_ont.json -i gold_alignment.tsv -o generated_alignment.tsv -m model_serialization_dir -p nn\n'
            )
            sys.stdout.write('-------------------------\n')
            sys.stdout.write('Accepted KB file formats: json, pickle, owl\n')
            sys.stdout.write('Accepted alignment file formats: rdf, tsv\n')
            sys.stdout.write('Accepted model types (defaults to nn):\n')
            sys.stdout.write('\tnn (neural network)\n')
            sys.stdout.write('\tlr (logistic regression)\n')
            sys.stdout.write('Accepted alignment strategies (defaults to best):\n')
            sys.stdout.write('\tbest (best match per entity above threshold)\n')
            sys.stdout.write('\tall (all matches per entity above threshold)\n')
            sys.stdout.write('\tmodh (modified Hungarian algorithm for assignment)\n')
            sys.stdout.write('Pretrained models can be found at:\n')
            sys.stdout.write(' /net/nfs.corp/s2-research/scigraph/ontoemma/\n')
            sys.stdout.write('-------------------------\n')
            sys.stdout.write('\n')
            sys.exit(0)
        elif opt in ("-s", "--source"):
            source_ont_file = os.path.abspath(arg)
            sys.stdout.write('Source ontology file is %s\n' % source_ont_file)
        elif opt in ("-t", "--target"):
            target_ont_file = os.path.abspath(arg)
            sys.stdout.write('Target ontology file is %s\n' % target_ont_file)
        elif opt in ("-i", "--input"):
            input_alignment_file = os.path.abspath(arg)
            sys.stdout.write(
                'Input alignment file is %s\n' % input_alignment_file
            )
        elif opt in ("-o", "--output"):
            output_alignment_file = os.path.abspath(arg)
            sys.stdout.write(
                'Output alignment file is %s\n' % output_alignment_file
            )
        elif opt in ("-m", "--model_path"):
            model_path = os.path.abspath(arg)
        elif opt in ("-p", "--model_type"):
            if arg in emma.constants.IMPLEMENTED_MODEL_TYPES:
                model_type = arg
                sys.stdout.write(
                    'Model type is %s\n' %
                    emma.constants.IMPLEMENTED_MODEL_TYPES[model_type]
                )
            else:
                sys.stdout.write('Error: Unknown model type...\n')
                sys.exit(1)
        elif opt in ("-a", "--alignment_strategy"):
            if arg in emma.constants.IMPLEMENTED_ALIGNMENT_STRATEGY:
                align_strat = arg
                sys.stdout.write(
                    'Alignment selection strategy is %s\n' % arg.upper()
                )
            else:
                sys.stdout.write('Error: Unknown alignment selection strategy\n')
        elif opt in ("-g", "--cuda_device"):
            cuda_device = int(arg)
            sys.stdout.write(
                'Using CUDA device %i\n' % cuda_device
            )

    sys.stdout.write('\n')

    if source_ont_file is not None and target_ont_file is not None:
        matcher = OntoEmma()
        matcher.align(
            model_type, model_path,
            source_ont_file, target_ont_file,
            input_alignment_file, output_alignment_file,
            align_strat, cuda_device
        )
def main(argv):
    model_path = None
    model_type = "nn"
    config_file = None
    evaluate_flag = False
    evaluation_data_file = None
    cuda_device = -1

    # make sure NLTK resources are available, working around SSL certificate
    # issues when downloading
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context
        nltk.download("stopwords")
        nltk.download("wordnet")

    try:
        # TODO(waleeda): use argparse instead of getopt to parse command line arguments.
        opts, args = getopt.getopt(argv, "hec:m:p:d:g:", [
            "evaluate", "config=", "model_path=", "model_type=",
            "evaluation_data_file=", "cuda_device="
        ])
    except getopt.GetoptError:
        sys.stdout.write('Unknown option... -h or --help for help.\n')
        sys.exit(1)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            sys.stdout.write('Options: \n')
            sys.stdout.write('-c <configuration_file>\n')
            sys.stdout.write('-m <model_location>\n')
            sys.stdout.write('-p <model_type>\n')
            sys.stdout.write('-e # evaluation mode\n')
            sys.stdout.write('-d <evaluation_data_file>\n')
            sys.stdout.write('-g <cuda_device>\n\n')
            sys.stdout.write('Example usages: \n')
            sys.stdout.write(
                ' ./train_ontoemma.py -c configuration_file.json -m model_file_path -p nn\n'
            )
            sys.stdout.write(
                ' ./train_ontoemma.py -e -m model_file_path -d evaluation_data_path -g 5\n'
            )
            sys.stdout.write('-------------------------\n')
            sys.stdout.write(
                'Accepted model types: nn (neural network), lr (logistic regression)\n'
            )
            sys.stdout.write('-------------------------\n')
            sys.stdout.write('\n')
            sys.exit(0)
        elif opt in ("-e", "--evaluate"):
            evaluate_flag = True
            sys.stdout.write('Evaluation mode\n')
        elif opt in ("-c", "--config"):
            config_file = os.path.abspath(arg)
            sys.stdout.write('Configuration file is %s\n' % config_file)
        elif opt in ("-m", "--model_path"):
            model_path = os.path.abspath(arg)
            sys.stdout.write('Model output path is %s\n' % model_path)
        elif opt in ("-p", "--model_type"):
            if arg in emma.constants.IMPLEMENTED_MODEL_TYPES:
                model_type = arg
                sys.stdout.write(
                    'Model type is %s\n' %
                    emma.constants.IMPLEMENTED_MODEL_TYPES[model_type])
            else:
                sys.stdout.write('Error: Unknown model type...\n')
                sys.exit(1)
        elif opt in ("-d", "--evaluation_data_file"):
            evaluation_data_file = os.path.abspath(arg)
            sys.stdout.write('Evaluation data file is %s\n' % evaluation_data_file)
        elif opt in ("-g", "--cuda_device"):
            cuda_device = int(arg)
            sys.stdout.write('Using CUDA device %i\n' % cuda_device)

    sys.stdout.write('\n')

    matcher = OntoEmma()

    if evaluate_flag:
        matcher.evaluate(model_type, model_path, evaluation_data_file, cuda_device)
    else:
        matcher.train(model_type, model_path, config_file)