def schema_alignment_by_wordnet(self, entity_synset, dul_ontology_classes):
    '''
    Suggest the ontology class that best matches an entity, using the
    WordNet is-a taxonomy path similarity for alignment.

    The intuition is that the head word of a label carries the important
    information about a concept, so only the last space-separated token of
    every entity label is compared. Each head word is scored against every
    label of every ontology class; per class only the highest score is
    kept, and the class with the highest score overall is suggested.

    params:
    entity_synset - iterable of label strings describing the entity and
                    the types mentioned in its context
    dul_ontology_classes - dict() mapping a class URI to an iterable of
                    representative labels for that class

    return:
    the URI of the best scoring class, or None when the best score is not
    greater than 0.0 or when there is nothing to compare (no entity
    labels, no ontology classes, or no class with at least one label)
    '''
    # Nothing to compare. Without this guard the final max() raised
    # "ValueError: max() arg is an empty sequence".
    if not entity_synset or not dul_ontology_classes:
        return None

    from oke.oak.util import wordnet_shortest_path

    best_score_by_class = {}
    for entity_label in entity_synset:
        headword = entity_label.split(' ')[-1]
        for class_uri, class_labels in dul_ontology_classes.items():
            if not class_labels:
                # a class without labels cannot be scored; max([]) would raise
                continue
            # NOTE(review): assumes wordnet_shortest_path returns a number
            # where larger means more similar - confirm in oke.oak.util
            score = max(
                wordnet_shortest_path(headword, class_label)
                for class_label in class_labels)
            known_score = best_score_by_class.get(class_uri)
            if known_score is None or known_score < score:
                best_score_by_class[class_uri] = score

    if not best_score_by_class:
        return None

    # choose the most similar one
    suggested_class = max(best_score_by_class, key=best_score_by_class.get)
    if best_score_by_class[suggested_class] > 0.0:
        return suggested_class
    return None
 def schema_alignment_by_wordnet(self, entity_synset, dul_ontology_classes):
     '''
     Suggest the ontology class that best matches an entity, using the
     WordNet is-a taxonomy path similarity for alignment.

     Only the head word (last space-separated token) of each entity label
     is compared. For every ontology class the best score over all of its
     labels and all entity head words is remembered; the class with the
     overall best score is the suggestion.

     params:
     entity_synset - iterable of label strings describing the entity and
                     the types mentioned in its context
     dul_ontology_classes - dict() mapping a class URI to an iterable of
                     representative labels for that class

     return:
     the URI of the best scoring class, or None when the best score is not
     greater than 0.0 or when no comparison could be made (empty inputs or
     only classes without labels)
     '''
     # Empty input used to end in "ValueError: max() arg is an empty
     # sequence"; there is simply no class to suggest in that case.
     if not entity_synset or not dul_ontology_classes:
         return None

     from oke.oak.util import wordnet_shortest_path

     class_scores = {}
     for entity_label in entity_synset:
         entity_label_headword = entity_label.split(' ')[-1]
         for classUri, classLabels in dul_ontology_classes.items():
             if not classLabels:
                 # no labels -> nothing to score for this class
                 continue
             # NOTE(review): assumes a numeric score, larger = more similar
             max_sim = max(
                 wordnet_shortest_path(entity_label_headword, class_label)
                 for class_label in classLabels)
             previous = class_scores.get(classUri)
             if previous is None or previous < max_sim:
                 class_scores[classUri] = max_sim

     if not class_scores:
         return None

     # choose the most similar one
     suggested_class = max(class_scores, key=class_scores.get)
     suggested_class_prob = class_scores[suggested_class]
     return suggested_class if suggested_class_prob > 0.0 else None
    def semantic_similarity_for_alignment(self, entity_class_labels,
                                          related_lod_rdf_type_labels):
        '''
        Fallback alignment: suggest the DUL ontology class whose keywords
        are closest, by WordNet path similarity (is-a), to the entity class
        labels.

        Multi-word labels are skipped entirely because they are too noisy;
        only single-word labels are compared. For every class in
        self.dul_ontology_classes the best score over its labels is kept,
        and the best scoring class is suggested when its score is above a
        threshold of 0.1. The suggestion and its score are printed.

        params:
        entity_class_labels : iterable (set) of label strings
        related_lod_rdf_type_labels : set, currently not used by this method

        return string, suggested DUL class URI
        return "" if no DUL class matched or the best score is not above
        the threshold
        '''
        # Multi-word terms are deliberately ignored (too noisy in LOD types).
        single_word_labels = [
            label for label in entity_class_labels
            if len(label.split(' ')) == 1
        ]
        if not single_word_labels:
            return ""

        from oke.oak.util import wordnet_shortest_path

        similarity_threshold = 0.1
        best_score_by_class = {}
        for label in single_word_labels:
            for class_uri, class_labels in self.dul_ontology_classes.items():
                if not class_labels:
                    # max() of an empty sequence raises ValueError
                    continue
                # only preserve the maximum value of the matched pairs
                score = max(
                    wordnet_shortest_path(label, class_label)
                    for class_label in class_labels)
                known_score = best_score_by_class.get(class_uri)
                if known_score is None or known_score < score:
                    best_score_by_class[class_uri] = score

        # The check used to be "len(...) > 1", which dropped the answer when
        # exactly one class had been scored although a match existed.
        if not best_score_by_class:
            return ""

        # choose the most similar DUL class matched
        suggested_alignment_class = max(best_score_by_class,
                                        key=best_score_by_class.get)
        suggested_class_prob = best_score_by_class[suggested_alignment_class]
        print("suggested_alignment_class from semantic computation:",
              suggested_alignment_class, " , probability:",
              suggested_class_prob)
        if suggested_class_prob > similarity_threshold:
            return suggested_alignment_class
        return ""
 def semantic_similarity_for_alignment(self, entity_class_labels, related_lod_rdf_type_labels):
     '''
     Fallback alignment: suggest the DUL ontology class whose keywords are
     closest, by WordNet path similarity (is-a), to the entity class labels.

     Labels made of several words are ignored (too noisy); every remaining
     single-word label is compared with all labels of every class in
     self.dul_ontology_classes and the best score per class is kept. The
     best class is returned when its score exceeds 0.1. The suggestion and
     its score are printed.

     params:
     entity_class_labels : iterable (set) of label strings
     related_lod_rdf_type_labels : set, currently not used by this method

     return string, suggested DUL class URI
     return "" if no DUL class matched or the best score is not above the
     threshold
     '''
     # keep single-word labels only; multi-word terms are skipped on purpose
     candidate_labels = [
         entity_class_label for entity_class_label in entity_class_labels
         if len(entity_class_label.split(' ')) == 1
     ]
     if not candidate_labels:
         return ""

     from oke.oak.util import wordnet_shortest_path

     similarity_threshold = 0.1
     class_scores = {}
     for entity_class_label in candidate_labels:
         for classUri, classLabels in self.dul_ontology_classes.items():
             if not classLabels:
                 # a class without keywords cannot be scored (max([]) raises)
                 continue
             # only preserve the maximum value of matched pairs
             max_sim = max(
                 wordnet_shortest_path(entity_class_label, class_label)
                 for class_label in classLabels)
             previous = class_scores.get(classUri)
             if previous is None or previous < max_sim:
                 class_scores[classUri] = max_sim

     # Was "len(...) > 1": a single scored class is still a valid match.
     if not class_scores:
         return ""

     # choose a most similar DUL class matched
     suggested_alignment_class = max(class_scores, key=class_scores.get)
     suggested_class_prob = class_scores[suggested_alignment_class]
     print("suggested_alignment_class from semantic computation:",suggested_alignment_class, " , probability:",suggested_class_prob)
     return suggested_alignment_class if suggested_class_prob > similarity_threshold else ""
    def compute_features(self, context_data):
        '''
        Build one Datum per token of the context sentence, each carrying the
        feature dictionary used for sequence labelling.

        The sentence (context_data.isString) is tokenised and POS tagged,
        and self.sem_tag supplies a semantic tag per token. A token is
        labelled 'class' when its semantic tag is 'class', otherwise 'O'.
        Every token gets word-level features (POS, case, digits, suffix,
        stop word, ...), gazetteer lookup features, and - when RDF types
        were found for the entity - the maximum WordNet path score between
        the token and the head words of the RDF type labels. Finally the
        basic features of up to 8 preceding and 3 following tokens are
        copied in under 'prev_<i>_...' / 'next_<i>_...' keys.

        Original design note: a maximum entropy model is fed with the set
        of features associated with a given token and returns the
        probability that the token falls into each class the model was
        trained on.

        Two feature bugs are fixed compared with earlier revisions:
        'is_countryAdj' now really lower-cases the word before the gazetteer
        lookup, and 'is_Entity' now compares the semantic tag (not the whole
        tagged item) with 'entity'. Both features were effectively constant
        'N' before, so models trained on the old features should be
        retrained.

        params:
        context_data - oke.oak.TaskContext instance

        return: list of Datum, one per token, in sentence order
        raises: Exception when context_data is not a TaskContext
        '''
        import string

        from oke.oak.util import wordnet_shortest_path
        from oke.oak.util import extract_type_label
        from oke.oak.util import get_URI_fragmentIdentifier
        from oke.oak.util import contains_digits

        if not isinstance(context_data, TaskContext):
            raise Exception('Type error: context_data must be the instance of oke.oak.TaskContext')

        context_words = word_tokenize(context_data.isString)
        tagged_context = pos_tag(context_words)
        sem_tagged_context = self.sem_tag(context_words, context_data)

        # LOD based semantic type feature: labels of the entity's RDF types
        entity_dbpedia_URI = context_data.entity.taIdentRef
        entity_rdftypes = self.entity_rdftypes_feature_extraction(entity_dbpedia_URI)
        if len(entity_rdftypes) == 0:
            print("Warn: No rdf types can be found for [current word")
        entity_semantics = set(
            extract_type_label(get_URI_fragmentIdentifier(rdftype_uri))
            for rdftype_uri in entity_rdftypes)
        # The entity head word is intentionally not added to the type labels:
        # it rarely represents a word that is essential for the type.
        # Only the head word of every type label is compared with a token.
        type_head_words = [sem_type.split(' ')[-1] for sem_type in entity_semantics]

        # sliding window sizes on the left and right hand side
        sliding_window_prev_n_words = 8
        sliding_window_next_n_words = 3

        noun_tags = ("NN", "NNP", "NNS")
        # type_indicator could also be retrieved by wordnet synonyms
        type_indicator_words = (
            'name', 'form', 'type', 'class', 'category', 'variety', 'style',
            'model', 'substance', 'version', 'genre', 'matter', 'mound',
            'kind', 'shade')
        # (suffix of the window key, feature copied from the neighbour token)
        window_features = (
            ('word_pos', 'word_pos'),
            ('word_is_StopWord', 'is_StopWord'),
            ('word_is_Entity', 'is_Entity'),
            ('word_is_title', 'is_title'),
            ('word_all_capital', 'all_capital'),
            ('word_is_word_root_be', 'is_word_root_be'),
            ('word_is_punct_comma', 'is_punct_comma'),
            ('word_word_with_digits', 'word_with_digits'),
            ('word_last_2_letters', 'last_2_letters'),
            ('word_type_indicator', 'type_indicator'),
            ('word_is_orgKey', 'is_orgKey'),
            ('word_is_locKey', 'is_locKey'),
            ('word_is_country', 'is_country'),
            ('word_is_countryAdj', 'is_countryAdj'),
            ('word_is_personName', 'is_personName'),
            ('word_is_personTitle', 'is_personTitle'),
            ('word_is_facKey', 'is_facKey'),
        )

        datums = []
        # first pass: own features plus features of the preceding tokens
        for currentIndex, tagged_word in enumerate(tagged_context):
            currentWord = tagged_word[0]
            word_pos = tagged_word[1]
            lowered = currentWord.lower()
            sem_label = sem_tagged_context[currentIndex][1]

            # label encoding
            currentWord_label = 'class' if sem_label == 'class' else 'O'
            datum = Datum(context_data.contextURI, currentWord, currentWord_label)
            datum.previousLabel = datums[-1].label if datums else 'None'

            features = {}
            # word-level features (part-of-speech, case, punctuation, digit, morphology)
            if (lowered not in self.stoplist
                    and currentWord not in string.punctuation
                    and not currentWord.isdigit()
                    and word_pos in noun_tags):
                # use lemmatised word
                features["word"] = self.wordnet_lemmatizer.lemmatize(currentWord, pos='n')
            features["word_pos"] = word_pos
            features["is_title"] = currentWord.istitle()
            features['all_capital'] = currentWord.isupper()
            features["is_word_root_be"] = 'Y' if self.wordnet_lemmatizer.lemmatize(currentWord, pos='v') == 'be' else 'N'
            features['is_punct_comma'] = 'Y' if currentWord == ',' else 'N'
            features['word_with_digits'] = 'Y' if word_pos != 'CD' and contains_digits(currentWord) else 'N'
            features["is_StopWord"] = 'Y' if currentWord in self.stoplist else 'N'
            # fixed: compare the semantic tag, not the whole tagged item
            features["is_Entity"] = 'Y' if sem_label == 'entity' else 'N'
            features["last_2_letters"] = 'None' if len(currentWord) <= 2 or currentWord.isdigit() else currentWord[-2:]
            features["type_indicator"] = 'Y' if currentWord in type_indicator_words else 'N'

            # semantic (gazetteer lookup) features
            features["is_orgKey"] = 'Y' if lowered in self.gaz_org_key else 'N'
            features["is_locKey"] = 'Y' if lowered in self.gaz_loc_key else 'N'
            features["is_country"] = 'Y' if lowered in self.gaz_country else 'N'
            # fixed: ".lower" was referenced without being called
            features["is_countryAdj"] = 'Y' if lowered in self.gaz_countryAdj else 'N'
            features["is_personName"] = 'Y' if lowered in self.gaz_person_name else 'N'
            features["is_personTitle"] = 'Y' if lowered in self.gaz_person_title else 'N'
            features['is_jobtitle'] = 'Y' if lowered in self.gaz_job_title else 'N'
            features['is_facKey'] = 'Y' if lowered in self.gaz_facility_key else 'N'

            # path similarity between the dbpedia types and the current word
            if type_head_words:
                features['sim_dist_with_DbpediaType'] = max(
                    wordnet_shortest_path(currentWord, head_word)
                    for head_word in type_head_words)

            if currentIndex == 0:
                features['prev_word'] = "<START>"
            # only tokens that really exist on the left are copied
            for last_i in range(1, min(sliding_window_prev_n_words, currentIndex) + 1):
                neighbour = datums[currentIndex - last_i].features
                prefix = 'prev_' + str(last_i) + '_'
                for key_suffix, source in window_features:
                    features[prefix + key_suffix] = neighbour[source]

            datum.features = features
            datums.append(datum)

        # second pass: features of the following tokens
        token_count = len(datums)
        for currentIndex, datum in enumerate(datums):
            for next_i in range(1, sliding_window_next_n_words + 1):
                target = currentIndex + next_i
                if target == token_count:
                    # the window touches the position right after the last token
                    datum.features['next_word'] = "<END>"
                    continue
                prefix = 'next_' + str(next_i) + '_'
                # positions further beyond the sentence end get the 'None' marker
                neighbour = datums[target].features if target < token_count else None
                for key_suffix, source in window_features:
                    datum.features[prefix + key_suffix] = neighbour[source] if neighbour is not None else 'None'

        return datums
예제 #6
0
    def compute_features(self, context_data):
        '''
        Maximum entropy model gives a better performance for sequence labelling problem. 
        By maximizing the entropy in our model, we are attempting to minimise the amount of the information the model carries.
        Design a language model to maximise the entropy and 
            feed our language model with a set of features associated with a given token we wish to classify
            and the system can then given us the probability that our token falls into any given class of token against which our language model was trained.
        '''
        from oke.oak.util import wordnet_shortest_path
        from oke.oak.util import extract_type_label
        from oke.oak.util import get_URI_fragmentIdentifier
        from oke.oak.util import contains_digits

        #words, contextURI, previousLabel, position
        if type(context_data) is not TaskContext:
            raise Exception(
                'Type error: context_data must be the instance of oke.oak.TaskContext'
            )

        context_words = word_tokenize(context_data.isString)
        tagged_context = pos_tag(context_words)
        sem_tagged_context = self.sem_tag(context_words, context_data)

        entity_name = context_data.entity.anchorOf
        entity_head_word = entity_name.split(' ')[-1:][0]
        entity_dbpedia_URI = context_data.entity.taIdentRef
        #print("entity_dbpedia_URI:"+entity_dbpedia_URI)
        '''
        LOD based semantic type feature:
        '''
        entity_rdftypes = self.entity_rdftypes_feature_extraction(
            entity_dbpedia_URI)

        if (len(entity_rdftypes) == 0):
            print("Warn: No rdf types can be found for [current word"
                  )  #entity_name.decode("utf8"),"]")
        # extract labels from RDF type
        entity_semantics = set()
        entity_semantics.update(
            set([
                extract_type_label(get_URI_fragmentIdentifier(rdftype_uri))
                for rdftype_uri in entity_rdftypes
            ]))

        #print('sem_tagged_context:',sem_tagged_context)
        #add head word into rdf type
        #  to avoid adding head word into rdf type: not many head word represent essential word associated with type
        #entity_semantics.add(entity_head_word)
        #print("entity_semantics:",entity_semantics)
        datums = []

        #compute features for each word
        #use sliding window to observe on both left and right hand side
        currentIndex = 0
        sliding_window_prev_n_words = 8
        sliding_window_next_n_words = 3

        for tagged_word in tagged_context:
            currentWord = tagged_word[0]
            #label encoding
            currentWord_label = 'O' if sem_tagged_context[currentIndex][
                1] != 'class' else 'class'
            datum = Datum(context_data.contextURI, currentWord,
                          currentWord_label)

            datum.previousLabel = datums[currentIndex - 1].label if (
                currentIndex - 1) in range(0, len(datums)) else 'None'

            features = {}
            #word-level features (part-of-speech, case, punctuation,digit,morphology)
            import string
            if currentWord.lower(
            ) not in self.stoplist and currentWord not in string.punctuation and currentWord.isdigit(
            ) is not True and tagged_word[1] in ["NN", "NNP", "NNS"]:
                #use lemmatised word
                features["word"] = self.wordnet_lemmatizer.lemmatize(
                    currentWord, pos='n')
                #Word sense of Noun: we can use "WN_CLASS" to determine whether the NN word is a hyponym of w (or keywords) in ontology by wordnet
                #features["WN_CLASS"]=
            features["word_pos"] = tagged_word[1]
            #features["word_root"]=self.wordnet_lemmatizer.lemmatize(currentWord, pos='n')
            features["is_title"] = str(currentWord).istitle()
            features['all_capital'] = currentWord.isupper()
            features[
                "is_word_root_be"] = 'Y' if self.wordnet_lemmatizer.lemmatize(
                    currentWord, pos='v') == 'be' else 'N'
            features['is_punct_comma'] = 'Y' if str(
                currentWord) == ',' else 'N'
            features['word_with_digits'] = 'Y' if tagged_word[
                1] != 'CD' and contains_digits(str(currentWord)) else 'N'
            features[
                "is_StopWord"] = 'Y' if currentWord in self.stoplist else 'N'
            features["is_Entity"] = 'N' if sem_tagged_context[
                currentIndex] != 'entity' else 'Y'
            features["last_2_letters"] = 'None' if len(
                str(currentWord)) <= 2 or str(currentWord).isdigit() else str(
                    currentWord)[-2:]
            #type_indicator can be retrieved by wordnet synonyms
            features["type_indicator"] = 'Y' if currentWord in [
                'name', 'form', 'type', 'class', 'category', 'variety',
                'style', 'model', 'substance', 'version', 'genre', 'matter',
                'mound', 'kind', 'shade', 'substance'
            ] else 'N'

            #semantic (gazetteer lookup) features
            features["is_orgKey"] = 'Y' if currentWord.lower(
            ) in self.gaz_org_key else 'N'
            features["is_locKey"] = 'Y' if currentWord.lower(
            ) in self.gaz_loc_key else 'N'
            features["is_country"] = 'Y' if currentWord.lower(
            ) in self.gaz_country else 'N'
            features[
                "is_countryAdj"] = 'Y' if currentWord.lower in self.gaz_countryAdj else 'N'
            features["is_personName"] = 'Y' if currentWord.lower(
            ) in self.gaz_person_name else 'N'
            features["is_personTitle"] = 'Y' if currentWord.lower(
            ) in self.gaz_person_title else 'N'
            features['is_jobtitle'] = 'Y' if currentWord.lower(
            ) in self.gaz_job_title else 'N'
            features['is_facKey'] = 'Y' if currentWord.lower(
            ) in self.gaz_facility_key else 'N'

            #add feature to compute path similarity between dbpedia type and current word

            if entity_semantics:
                max_sim = max([
                    wordnet_shortest_path(currentWord,
                                          sem_type.split(' ')[-1:][0])
                    for sem_type in entity_semantics
                ])
                features['sim_dist_with_DbpediaType'] = max_sim

            for last_i in range(1, sliding_window_prev_n_words + 1):
                if currentIndex == 0:
                    features['prev_word'] = "<START>"

                if currentIndex != 0 and currentIndex - last_i >= 0:
                    #features['prev_'+str(last_i)+'_word']=datums[currentIndex-last_i].features['word'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_' + str(last_i) + '_word_pos'] = datums[
                        currentIndex - last_i].features['word_pos'] if (
                            currentIndex -
                            last_i) in range(0, len(datums)) else 'None'
                    #features['prev_'+str(last_i)+'_word_root']=datums[currentIndex-last_i].features['word_root'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_StopWord'] = datums[
                                 currentIndex -
                                 last_i].features['is_StopWord'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_Entity'] = datums[
                                 currentIndex -
                                 last_i].features['is_Entity'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_title'] = datums[
                                 currentIndex -
                                 last_i].features['is_title'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_all_capital'] = datums[
                                 currentIndex -
                                 last_i].features['all_capital'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_word_root_be'] = datums[
                                 currentIndex -
                                 last_i].features['is_word_root_be'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_punct_comma'] = datums[
                                 currentIndex -
                                 last_i].features['is_punct_comma'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_word_with_digits'] = datums[
                                 currentIndex -
                                 last_i].features['word_with_digits'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_last_2_letters'] = datums[
                                 currentIndex -
                                 last_i].features['last_2_letters'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_type_indicator'] = datums[
                                 currentIndex -
                                 last_i].features['type_indicator'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_orgKey'] = datums[
                                 currentIndex -
                                 last_i].features['is_orgKey'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_locKey'] = datums[
                                 currentIndex -
                                 last_i].features['is_locKey'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_country'] = datums[
                                 currentIndex -
                                 last_i].features['is_country'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_countryAdj'] = datums[
                                 currentIndex -
                                 last_i].features['is_countryAdj'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_personName'] = datums[
                                 currentIndex -
                                 last_i].features['is_personName'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_personTitle'] = datums[
                                 currentIndex -
                                 last_i].features['is_personTitle'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_facKey'] = datums[
                                 currentIndex -
                                 last_i].features['is_facKey'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'

            datum.features = features
            currentIndex += 1
            datums.append(datum)

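        # Add look-ahead features: for each datum, copy selected features of the
        # following words (up to sliding_window_next_n_words ahead) under 'next_<i>_...' keys.
        # Reset the running index to 0 before iterating over tagged_context.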
        currentIndex = 0
        for tagged_word in tagged_context:
            for next_i in range(1, sliding_window_next_n_words + 1):
                if ((currentIndex + next_i) == len(datums)):
                    datums[currentIndex].features['next_word'] = "<END>"

                if (currentIndex + next_i) != len(datums):
                    #datums[currentIndex].features['next_'+str(next_i)+'_word']=datums[currentIndex+next_i].features['word'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_pos'] = datums[currentIndex +
                                              next_i].features['word_pos'] if (
                                                  currentIndex +
                                                  next_i) in range(
                                                      0,
                                                      len(datums)) else 'None'

                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_StopWord'] = datums[
                            currentIndex + next_i].features['is_StopWord'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_Entity'] = datums[
                            currentIndex + next_i].features['is_Entity'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'

                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_title'] = datums[
                            currentIndex + next_i].features['is_title'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_all_capital'] = datums[
                            currentIndex + next_i].features['all_capital'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_word_root_be'] = datums[
                            currentIndex +
                            next_i].features['is_word_root_be'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_punct_comma'] = datums[
                            currentIndex +
                            next_i].features['is_punct_comma'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_word_with_digits'] = datums[
                            currentIndex +
                            next_i].features['word_with_digits'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_last_2_letters'] = datums[
                            currentIndex +
                            next_i].features['last_2_letters'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_type_indicator'] = datums[
                            currentIndex +
                            next_i].features['type_indicator'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_orgKey'] = datums[
                            currentIndex + next_i].features['is_orgKey'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_locKey'] = datums[
                            currentIndex + next_i].features['is_locKey'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_country'] = datums[
                            currentIndex + next_i].features['is_country'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_countryAdj'] = datums[
                            currentIndex +
                            next_i].features['is_countryAdj'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_personName'] = datums[
                            currentIndex +
                            next_i].features['is_personName'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_personTitle'] = datums[
                            currentIndex +
                            next_i].features['is_personTitle'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_facKey'] = datums[
                            currentIndex + next_i].features['is_facKey'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
            currentIndex += 1

        return datums