    def MEclassifier_model(self, compute_feature=False, is_train=True):
        '''
        Maximum entropy based classifier for class induction.

        compute_feature: recompute and export training features first
        is_train: train a new model; otherwise load the pickled model
            ("me_class_inducer.m")
        '''
        from oke.oak.FeatureFactory import FeatureFactory
        # load the training (gold standard) data
        featureFactory = FeatureFactory()
        if compute_feature:
            #compute and export training features
            featureFactory.export_to_features('trainWithFeatures')
        else:
            print("skip computing features...")

        print("loading features from 'trainWithFeatures.json'...")
        # read the precomputed features back from the JSON file
        datums = featureFactory.readData('trainWithFeatures.json')
        train_set = [(datum.features, datum.label) for datum in datums]
        print("train set size", len(train_set))

        if is_train:
            class_classifier = self.train(train_set)
        else:
            class_classifier = self.load_classifier_model(
                classifier_pickled="me_class_inducer.m")

        return class_classifier
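
    # Example usage (a sketch; `inducer` stands for an instance of the
    # enclosing class, which is not shown in this excerpt):
    #
    #   inducer = ...
    #   # compute features once and train a maxent classifier
    #   classifier = inducer.MEclassifier_model(compute_feature=True, is_train=True)
    #   # or reload the previously pickled model
    #   classifier = inducer.MEclassifier_model(is_train=False)
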
    def feature_extraction_for_prediction(self, graph_in_memory, context,
                                          context_sent):
        '''
        Feature extraction for the current context in the prediction phase.

        Returns a tuple of (datums, context_data).
        '''
        from oke.oak.nif2rdfProcessor import NIF2RDFProcessor
        from oke.oak.FeatureFactory import FeatureFactory

        dataProcessor = NIF2RDFProcessor()
        featureFactory = FeatureFactory()

        context_data = dataProcessor.aggregate_context_data(
            graph_in_memory, context, context_sent)
        datums = featureFactory.compute_features(context_data)

        return (datums, context_data)
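
    # Sketch of the downstream prediction flow (the `classify` call is an
    # assumption about the classifier API, not defined in this excerpt):
    #
    #   datums, context_data = inducer.feature_extraction_for_prediction(
    #       graph, context, context_sent)
    #   labels = [classifier.classify(datum.features) for datum in datums]
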
    def batch_ontology_alignment(self):
        '''
        Ontology alignment for DOLCE+DnS Ultra Lite (DUL) classes:
        query DBpedia for rdf:type statements, then fall back to WordNet
        path-similarity (is-a taxonomy) matching when no DUL class is found.
        '''
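        # For reference, WordNet path similarity over the is-a taxonomy can be
        # probed directly with NLTK (illustrative only; the alignment itself is
        # done by self.schema_alignment_by_wordnet below):
        #
        #   from nltk.corpus import wordnet as wn
        #   wn.synset('person.n.01').path_similarity(wn.synset('agent.n.01'))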
        from oke.oak.FeatureFactory import FeatureFactory
        from oke.oak.util import extract_type_label
        import collections

        featureFactory = FeatureFactory()

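        # reserved for a per-class evaluation (currently unused)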
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        contextDict = self.dataProcessor.get_task_context(
            self.dataProcessor.graphData_goldstandards)
        entityset = set()
        dulclassset = set()
        without_dulclass_num = 0

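        # only precision is reported at the end; the negative counters are
        # initialised for completeness but never incremented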
        true_positive = 0
        false_positive = 0
        true_negative = 0
        false_negative = 0

        for context, context_sent in contextDict.items():
            context_data = featureFactory.dataProcessor.aggregate_context_data(
                featureFactory.dataProcessor.graphData_goldstandards, context,
                context_sent)

            entity_dbpedia_URI = context_data.entity.taIdentRef
            entityClasses = context_data.entity.isInstOfEntityClasses

            labelled_class_type = [
                entityClass.subClassOf for entityClass in entityClasses
            ]
            print('labelled class type:', labelled_class_type)

            entity_class_labels = set(
                [entityClass.anchorOf for entityClass in entityClasses])

            entity_rdftypes = featureFactory.dbpedia_query_rdftypes(
                entity_dbpedia_URI)

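            # dereferenced rdf:types for the annotated class labels
            # (computed but currently unused; see the TODO below)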
            class_inst_rdftypes = featureFactory.dbpedia_query_deferencing_type(
                entity_class_labels)
            # step 1: Linked Open Data discovery: check whether a dul/d0 class
            # is already associated with the entity and type (via a
            # dereferenceable URI)

            # e.g. http://www.ontologydesignpatterns.org/ont/d0.owl#Location -> 'Location'
            entity_rdf_type_labels = set([
                extract_type_label(
                    featureFactory.get_URI_fragmentIdentifier(rdftype_uri))
                for rdftype_uri in entity_rdftypes
            ])
            #TODO: entity_class_rdf_type_labels

            # check whether a DUL class is already assigned in DBpedia
            dulClass = [
                rdftype for rdftype in entity_rdftypes
                if self.is_dul_class(rdftype)
            ]

            entityset.add(context_data.entity.taIdentRef)
            testset = set()
            if len(dulClass) > 0 and dulClass[0] in featureFactory.dul_ontology_classes:
                dulclassset.add(dulClass[0])
                testset.add(dulClass[0])
            else:
                # step 2: no DUL class in DBpedia; fall back to WordNet alignment
                without_dulclass_num += 1
                print(
                    str(without_dulclass_num) +
                    ' entities so far have no DUL class pre-classified in DBpedia')

                entity_synset = set()
                entity_synset.update(entity_rdf_type_labels)
                entity_synset.update(entity_class_labels)

                aligned_type = self.schema_alignment_by_wordnet(
                    entity_synset, featureFactory.dul_ontology_classes)
                print("string similarity aligned type for [",
                      entity_class_labels, '] is [', aligned_type, ']')
                dulclassset.add(aligned_type)
                testset.add(aligned_type)

            print("labelled class type:", labelled_class_type)
            print("predicted class type:", testset)
            if len(labelled_class_type) == 0:
                # a prediction without a gold label counts as a false positive
                false_positive += 1
            elif len(testset) > 0 and list(testset)[0] == list(labelled_class_type)[0]:
                true_positive += 1
            else:
                false_positive += 1

        predicted = true_positive + false_positive
        print('precision:', true_positive / predicted if predicted else 0.0)
        print('entityset size:', len(entityset))
        print('existing dul class size:', len(dulclassset))
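
    # Note: only precision is reported above. A fuller evaluation would also
    # track false negatives and compute recall/F1, e.g. (sketch):
    #
    #   recall = true_positive / (true_positive + false_negative)
    #   f1 = 2 * precision * recall / (precision + recall)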