def MEclassifier_model(self, compute_feature=False, is_train=True):
    """Maximum entropy based classifier for Class Induction.

    :param compute_feature: when True, recompute and export training features
        before loading them; otherwise reuse 'trainWithFeatures.json'.
    :param is_train: when True, train a fresh classifier on the goldstandard
        features; otherwise load the pickled model 'me_class_inducer.m'.
    :return: the trained or loaded class-induction classifier.
    """
    from oke.oak.FeatureFactory import FeatureFactory

    # Load the train (goldstandard) data.
    feature_factory = FeatureFactory()
    if compute_feature:
        # Compute and export training features.
        feature_factory.export_to_features('trainWithFeatures')
    else:
        print("skip computing features...")

    print('load features from \'trainWithFeatures.json\'... ')
    # Read back the precomputed feature data from JSON.
    datums = feature_factory.readData('trainWithFeatures.json')
    train_set = [(datum.features, datum.label) for datum in datums]
    print("train set size", len(train_set))

    if is_train:
        return self.train(train_set)
    return self.load_classifier_model(classifier_pickled="me_class_inducer.m")
def MEclassifier_model(self, compute_feature=False, is_train=True):
    """Maximum entropy based classifier for Class Induction.

    :param compute_feature: recompute and export training features when True;
        otherwise load the previously exported 'trainWithFeatures.json'.
    :param is_train: train from scratch when True; otherwise load the pickled
        model "me_class_inducer.m".
    :return: the class-induction classifier.
    """
    from oke.oak.FeatureFactory import FeatureFactory

    # load the train (goldstandard) data
    factory = FeatureFactory()
    if not compute_feature:
        print("skip computing features...")
    else:
        # compute and export training features
        factory.export_to_features('trainWithFeatures')

    print('load features from \'trainWithFeatures.json\'... ')
    # materialize (features, label) pairs from the exported JSON
    training_examples = [
        (datum.features, datum.label)
        for datum in factory.readData('trainWithFeatures.json')
    ]
    print("train set size", len(training_examples))

    if is_train:
        class_classifier = self.train(training_examples)
    else:
        class_classifier = self.load_classifier_model(
            classifier_pickled="me_class_inducer.m")
    return class_classifier
def feature_extraction_for_prediction(self, graph_in_memory, context, context_sent):
    """Feature extraction for the current context task in the prediction phase.

    :param graph_in_memory: in-memory RDF graph to aggregate context data from.
    :param context: context identifier of the current task.
    :param context_sent: sentence text of the context.
    :return: tuple of (computed feature datums, aggregated context data).
    """
    from oke.oak.nif2rdfProcessor import NIF2RDFProcessor
    from oke.oak.FeatureFactory import FeatureFactory

    data_processor = NIF2RDFProcessor()
    feature_factory = FeatureFactory()
    # Aggregate all context data first; features are derived from it.
    aggregated = data_processor.aggregate_context_data(
        graph_in_memory, context, context_sent)
    extracted_datums = feature_factory.compute_features(aggregated)
    return (extracted_datums, aggregated)
def feature_extraction_for_prediction(self, graph_in_memory, context, context_sent):
    """Feature extraction for current context task in prediction phase.

    :param graph_in_memory: in-memory RDF graph holding the task data.
    :param context: current context identifier.
    :param context_sent: the context sentence.
    :return: (datums, context_data) — per-datum features plus the aggregated
        context they were computed from.
    """
    from oke.oak.nif2rdfProcessor import NIF2RDFProcessor
    from oke.oak.FeatureFactory import FeatureFactory

    data_proc = NIF2RDFProcessor()
    factory = FeatureFactory()
    # Features are computed over the aggregated context record.
    context_record = data_proc.aggregate_context_data(
        graph_in_memory, context, context_sent)
    return (factory.compute_features(context_record), context_record)
def batch_ontology_alignment(self):
    """Ontology alignment for DOLCE+DnS Ultra Lite (DUL/D0) classes.

    Pipeline per goldstandard context:
      1. Linked Open Data discovering — query DBpedia rdf:types of the entity
         and accept a pre-classified dul/d0 class when one exists (e.g.
         http://www.ontologydesignpatterns.org/ont/d0.owl#Location).
      2. Otherwise fall back to WordNet path-similarity (is-a taxonomy)
         alignment of the entity's type/class labels against the DUL classes.
    Prints running diagnostics and the final precision over all contexts.
    """
    from oke.oak.FeatureFactory import FeatureFactory
    from oke.oak.util import extract_type_label

    featureFactory = FeatureFactory()
    contextDict = self.dataProcessor.get_task_context(
        self.dataProcessor.graphData_goldstandards)

    entityset = set()
    dulclassset = set()
    without_duclass_num = 0
    true_positive = 0
    false_positive = 0

    for context, context_sent in contextDict.items():
        context_data = featureFactory.dataProcessor.aggregate_context_data(
            featureFactory.dataProcessor.graphData_goldstandards,
            context, context_sent)
        entity_dbpedia_URI = context_data.entity.taIdentRef
        entityClasses = context_data.entity.isInstOfEntityClasses
        labelled_class_type = [entityClass.subClassOf
                               for entityClass in entityClasses]
        print('labelled class type:', labelled_class_type)
        entity_class_labels = {entityClass.anchorOf
                               for entityClass in entityClasses}
        entity_rdftypes = featureFactory.dbpedia_query_rdftypes(
            entity_dbpedia_URI)
        # NOTE(review): the result of this dereferencing query was never used
        # by the original code; the call is kept until confirmed safe to drop.
        featureFactory.dbpedia_query_deferencing_type(entity_class_labels)

        # step 1: check whether a dul class is already classified in DBpedia
        entity_rdf_type_labels = {
            extract_type_label(
                featureFactory.get_URI_fragmentIdentifier(rdftype_uri))
            for rdftype_uri in entity_rdftypes}
        dulClass = [rdftype for rdftype in entity_rdftypes
                    if self.is_dul_class(rdftype)]
        entityset.add(context_data.entity.taIdentRef)

        testset = set()
        if dulClass and dulClass[0] in featureFactory.dul_ontology_classes:
            dulclassset.add(dulClass[0])
            testset.add(dulClass[0])
        else:
            without_duclass_num += 1
            print(str(without_duclass_num)
                  + '> do not have dul class pre-classified in DBpedia')
            # step 2: WordNet alignment over the union of rdf type labels
            # and the entity's class anchor labels
            entity_synset = set()
            entity_synset.update(entity_rdf_type_labels)
            entity_synset.update(entity_class_labels)
            aligned_type = self.schema_alignment_by_wordnet(
                entity_synset, featureFactory.dul_ontology_classes)
            print("string similarity aligned type for [", entity_class_labels,
                  '] is [', aligned_type, ']')
            dulclassset.add(aligned_type)
            testset.add(aligned_type)

        print("labelled class type:", labelled_class_type)
        print("predicted class type:", testset)
        # Scoring fix: the original indexed list(...)[0] on possibly-empty
        # sets and raised IndexError; an empty prediction or empty gold label
        # now counts as a false positive instead of crashing.
        if testset and not labelled_class_type:
            false_positive += 1
        elif (testset and labelled_class_type
                and list(testset)[0] == list(labelled_class_type)[0]):
            true_positive += 1
        else:
            false_positive += 1

    # Guard against ZeroDivisionError when there are no contexts at all.
    scored = true_positive + false_positive
    print('precision:', true_positive / scored if scored else 0.0)
    print('entityset size:', len(entityset))
    print('existing dul class size:', len(dulclassset))
def batch_ontology_alignment(self):
    """Ontology alignment for DOLCE+DnS Ultra Lite classes.

    For every goldstandard context: query DBpedia for the entity's rdf types
    and accept an already-classified dul/d0 class when present (step 1,
    Linked Open Data discovering, e.g.
    http://www.ontologydesignpatterns.org/ont/d0.owl#Location); otherwise
    align the entity's type/class labels to the DUL ontology classes via
    WordNet path similarity (is-a taxonomy) matching (step 2).
    Prints per-context diagnostics plus the overall precision.
    """
    from oke.oak.FeatureFactory import FeatureFactory
    from oke.oak.util import extract_type_label

    feature_factory = FeatureFactory()
    context_dict = self.dataProcessor.get_task_context(
        self.dataProcessor.graphData_goldstandards)

    entityset = set()
    dulclassset = set()
    without_duclass_num = 0
    true_positive = 0
    false_positive = 0

    for context, context_sent in context_dict.items():
        context_data = feature_factory.dataProcessor.aggregate_context_data(
            feature_factory.dataProcessor.graphData_goldstandards,
            context, context_sent)
        entity_dbpedia_URI = context_data.entity.taIdentRef
        entity_classes = context_data.entity.isInstOfEntityClasses
        labelled_class_type = [ec.subClassOf for ec in entity_classes]
        print('labelled class type:', labelled_class_type)
        entity_class_labels = {ec.anchorOf for ec in entity_classes}
        entity_rdftypes = feature_factory.dbpedia_query_rdftypes(
            entity_dbpedia_URI)
        # NOTE(review): return value was never consumed by the original code;
        # the query call is retained until confirmed safe to remove.
        feature_factory.dbpedia_query_deferencing_type(entity_class_labels)

        # step 1: is a dul class already classified in DBpedia?
        entity_rdf_type_labels = {
            extract_type_label(
                feature_factory.get_URI_fragmentIdentifier(rdftype_uri))
            for rdftype_uri in entity_rdftypes}
        dul_class = [rdftype for rdftype in entity_rdftypes
                     if self.is_dul_class(rdftype)]
        entityset.add(context_data.entity.taIdentRef)

        testset = set()
        if dul_class and dul_class[0] in feature_factory.dul_ontology_classes:
            dulclassset.add(dul_class[0])
            testset.add(dul_class[0])
        else:
            without_duclass_num += 1
            print(str(without_duclass_num)
                  + '> do not have dul class pre-classified in DBpedia')
            # step 2: WordNet-based alignment over rdf type labels plus the
            # entity's class anchor labels
            entity_synset = set()
            entity_synset.update(entity_rdf_type_labels)
            entity_synset.update(entity_class_labels)
            aligned_type = self.schema_alignment_by_wordnet(
                entity_synset, feature_factory.dul_ontology_classes)
            print("string similarity aligned type for [", entity_class_labels,
                  '] is [', aligned_type, ']')
            dulclassset.add(aligned_type)
            testset.add(aligned_type)

        print("labelled class type:", labelled_class_type)
        print("predicted class type:", testset)
        # Fix: original raised IndexError via list(...)[0] when either the
        # prediction set or the gold labels were empty; such cases are now
        # scored as false positives.
        if testset and not labelled_class_type:
            false_positive += 1
        elif (testset and labelled_class_type
                and list(testset)[0] == list(labelled_class_type)[0]):
            true_positive += 1
        else:
            false_positive += 1

    # Fix: avoid ZeroDivisionError when the context dict is empty.
    scored = true_positive + false_positive
    print('precision:', true_positive / scored if scored else 0.0)
    print('entityset size:', len(entityset))
    print('existing dul class size:', len(dulclassset))