Exemplo n.º 1
0
 def getCost(config, transType=None):
     """Return a cost for the state `config` measured against the gold VMWEs.

     Gold VMWEs (``config.sent.vMWEs``) that are interleaving or already
     listed in ``config.sent.identifiedVMWEs`` are ignored.  For every other
     VMWE whose tokens cover the whole top stack element, the cost grows by
     one when another token of that VMWE is still deeper in the stack or,
     failing that, in the buffer.

     Single-token VMWEs get extra treatment: a Token on top costs one; a
     one-element list on top returns 0 immediately when its element has
     exactly one parent MWE, and costs one otherwise.

     config    -- configuration providing ``sent``, ``stack`` and ``buffer``.
     transType -- accepted but not used by this function.
     """
     sent = config.sent
     total = 0
     for vmwe in sent.vMWEs:
         if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
             continue

         top = config.stack[-1]
         topInside = all(tok.In(vmwe) for tok in Sentence.getTokens(top))

         if topInside and len(vmwe.tokens) == 1:
             if isinstance(top, Token):
                 total += 1
                 continue
             if isinstance(top, list) and len(top) == 1:
                 if len(top[0].parentMWEs) == 1:
                     return 0
                 # No `continue` here: the checks below may add to the cost too.
                 total += 1

         if not topInside:
             continue

         # Part of the VMWE left deeper in the stack, or still waiting in the
         # buffer, counts once.
         leftBehind = any(tok.In(vmwe)
                          for element in config.stack[:-1]
                          for tok in Sentence.getTokens(element))
         if leftBehind or any(b.In(vmwe) for b in config.buffer):
             total += 1
     return total
Exemplo n.º 2
0
 def getCost(config, transType=None):
     """Return a cost for the state `config` based on the first buffer element.

     Gold VMWEs (``config.sent.vMWEs``) that are interleaving or already
     listed in ``config.sent.identifiedVMWEs`` are ignored.  A remaining VMWE
     adds one to the cost when all of the following hold:

     * the first buffer element belongs to it,
     * the top stack element is not entirely made of its tokens,
     * some element below the stack top contains one of its tokens.

     An empty buffer yields 0 instead of raising IndexError, and fewer than
     two stack elements can never contribute to the cost.

     config    -- configuration providing ``sent``, ``stack`` and ``buffer``.
     transType -- accepted but not used by this function.
     """
     sent = config.sent
     # Without a front buffer element no VMWE can satisfy the first condition.
     if not config.buffer:
         return 0
     front = config.buffer[0]

     cost = 0
     for vmwe in sent.vMWEs:
         if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
             continue
         if not front.In(vmwe):
             continue
         # The penalty needs at least one element below the stack top.
         if len(config.stack) < 2:
             continue

         topInside = all(token.In(vmwe)
                         for token in Sentence.getTokens(config.stack[-1]))
         if topInside:
             continue

         # any() stops at the first hit; the original's second break was
         # nested inside the inner loop, so the outer scan never ended early.
         if any(token.In(vmwe)
                for element in config.stack[:-1]
                for token in Sentence.getTokens(element)):
             cost += 1
     return cost
Exemplo n.º 3
0
 def generateDisconinousFeatures(configuration, sent, transDic):
     """Add features linking the stack top to nearby buffer elements.

     The lemma string of the top stack element is looked up as a proper
     substring of the keys of ``Corpus.mweDictionary``.  For the first key
     that contains it, each of the first five buffer elements is tested: when
     "<stack lemmas> <buffer lemma>" or "<buffer lemma> <stack lemmas>" occurs
     in that key, two entries are written to `transDic`:

     * ``S0B<i>ArePartsOfMWE`` set to True,
     * ``S0B<i>ArePartsOfMWEDistance`` set to the difference between the
       positions in ``sent.tokens`` of the buffer element and of the last
       stack-top token.

     Only the first matching key is examined.  Nothing is returned.
     """
     stackTokens = Sentence.getTokens([configuration.stack[-1]])
     lemmaTxt = Sentence.getTokenLemmas(stackTokens)
     for entry in Corpus.mweDictionary:
         if lemmaTxt not in entry or lemmaTxt == entry:
             continue
         for position, candidate in enumerate(configuration.buffer[:5]):
             if candidate.lemma == '':
                 continue
             stackFirst = lemmaTxt + ' ' + candidate.lemma
             bufferFirst = candidate.lemma + ' ' + lemmaTxt
             if stackFirst in entry or bufferFirst in entry:
                 prefix = 'S0B' + str(position)
                 transDic[prefix + 'ArePartsOfMWE'] = True
                 transDic[prefix + 'ArePartsOfMWEDistance'] = (
                     sent.tokens.index(candidate) - sent.tokens.index(stackTokens[-1]))
         # Stop after the first dictionary entry that contains the lemma string.
         break
Exemplo n.º 4
0
    def apply(self, parent, sent, vMWEId=None, parse=False, vMWEType=None, mwtMerge=False):
        """Initialise this transition from `parent` by merging the stack top.

        The two top stack elements (or only the top one when `mwtMerge` is
        true) are replaced by a single list holding them.  When the merged
        group has more than one token, or exactly one token with `mwtMerge`,
        a VMWE is created from those tokens and appended to the new token
        list; a lone token without `mwtMerge` is appended as is.

        parent   -- previous transition; its ``configuration`` is copied.
        sent     -- sentence; its ``blackMergeNum`` is incremented when it is
                    truthy and `parse` is false, and it receives the new VMWE
                    in ``identifiedVMWEs`` when `parse` is true.
        vMWEId   -- id of the new VMWE; defaults to
                    ``VMWE.getVMWENumber(tokens) + 1``.
        vMWEType -- optional type assigned to the new VMWE.
        """
        Counters.blackMergeNum += 1
        if sent and not parse:
            sent.blackMergeNum += 1

        previous = parent.configuration
        bufferCopy = list(previous.buffer)
        if mwtMerge:
            stackCopy = list(previous.stack)[:-1]
            merged = [previous.stack[-1]]
        else:
            stackCopy = list(previous.stack)[:-2]
            merged = [previous.stack[-2], previous.stack[-1]]
        stackCopy.append(merged)
        tokensCopy = list(previous.tokens)

        mergedTokens = Sentence.getTokens(merged)
        tokenCount = len(mergedTokens)
        if tokenCount > 1 or (tokenCount == 1 and mwtMerge):
            mweId = vMWEId if vMWEId is not None else VMWE.getVMWENumber(tokensCopy) + 1
            vMWE = VMWE(mweId, mergedTokens[0])
            if parse:
                sent.identifiedVMWEs.append(vMWE)
            vMWE.tokens = mergedTokens
            if vMWEType is not None:
                vMWE.type = vMWEType
            tokensCopy.append(vMWE)
        elif tokenCount == 1:
            tokensCopy.append(mergedTokens[0])

        super(BlackMerge, self).__init__(
            config=Configuration(stack=stackCopy, buffer=bufferCopy, tokens=tokensCopy,
                                 sent=sent, transition=self),
            previous=parent, sent=sent)
Exemplo n.º 5
0
    def getCost(config, transType=None):
        """Return a cost for the state `config` based on its top stack element.

        List on top: gold VMWEs that are interleaving or already identified
        are skipped.  If the list's tokens all belong to a VMWE with the same
        number of tokens, 0 is returned at once; otherwise every VMWE that
        owns at least one of the tokens adds one.

        Token on top: 0 when the token has no parent MWEs, otherwise the
        number of its parent VMWEs that contain it and have more than one
        token.

        Any other kind of stack top yields 0.  `transType` is not used.
        """
        sent = config.sent
        top = config.stack[-1]

        if isinstance(top, list):
            cost = 0
            for vmwe in sent.vMWEs:
                if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
                    continue
                stackTokens = Sentence.getTokens(top)
                membership = [tok.In(vmwe) for tok in stackTokens]
                if len(stackTokens) == len(vmwe.tokens) and all(membership):
                    return 0
                if any(membership):
                    cost += 1
            return cost

        if isinstance(top, Token):
            parents = top.parentMWEs
            if parents is None or len(parents) == 0:
                return 0
            return sum(1 for vmwe in parents
                       if top.In(vmwe) and len(vmwe.tokens) > 1)

        return 0
Exemplo n.º 6
0
 def checkForVMWE(transition):
     """Return a COMPLETE transition following a MERGE, or None.

     Only applies when `transition` is of type ``TransitionType.MERGE`` and
     its configuration's stack holds exactly one element, a list.  The VMWE
     to complete is chosen as follows:

     1. the single parent of the first token that has exactly one parent MWE;
     2. otherwise, among the parents shared by every token of the list, the
        first one -- after dropping interleaving or embedded parents when
        more than one is shared.

     A Complete transition is created and applied only if the chosen VMWE has
     as many tokens as the stack list.

     Candidate filtering builds new lists instead of removing items from the
     list being iterated, and an empty candidate set yields None instead of
     an IndexError.
     """
     config = transition.configuration
     sent = config.sent
     # Check for a possible COMPLETE of an MWE after a MERGE transition.
     if transition.type != TransitionType.MERGE:
         return None
     if len(config.stack) != 1 or not isinstance(config.stack[0], list):
         return None

     tokens = Sentence.getTokens(config.stack[0])
     vMWE = None
     candidates = []
     for token in tokens:
         if len(token.parentMWEs) == 1:
             vMWE = token.parentMWEs[0]
             break
         for parent in token.parentMWEs:
             if parent not in candidates:
                 candidates.append(parent)

     if vMWE is None:
         # Keep only the parents that every token of the list belongs to.
         shared = [parent for parent in candidates
                   if all(parent in token.parentMWEs for token in tokens)]
         if len(shared) > 1:
             shared = [parent for parent in shared
                       if not (parent.isInterleaving or parent.isEmbedded)]
         vMWE = shared[0] if shared else None

     if vMWE is not None and len(vMWE.tokens) == len(tokens):
         complete = Complete(sent=sent)
         complete.apply(transition, sent, vMWEId=vMWE.id)
         return complete
     return None
Exemplo n.º 7
0
    def getCost(config, transType=None):
        """Return a cost for the state `config` from its two top stack elements.

        Gold VMWEs that are interleaving or already identified are skipped.
        For each remaining VMWE the cost grows by one when:

        * the VMWE has a single token and the stack top is a Token belonging
          to it;
        * both top elements lie entirely inside the VMWE and together hold as
          many tokens as the VMWE;
        * exactly one of the two top elements lies entirely inside the VMWE
          and another of its tokens sits deeper in the stack or in the buffer.

        Apart from the single-token case, the second stack element is read
        for every VMWE that is not skipped.  `transType` is not used.
        """
        sent = config.sent
        penalty = 0
        for vmwe in sent.vMWEs:
            if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
                continue

            top = config.stack[-1]
            topInside = all(tok.In(vmwe) for tok in Sentence.getTokens(top))
            if topInside and len(vmwe.tokens) == 1 and isinstance(top, Token):
                penalty += 1
                continue

            secondInside = all(tok.In(vmwe)
                               for tok in Sentence.getTokens(config.stack[-2]))

            if topInside and secondInside:
                pairSize = len(Sentence.getTokens(config.stack[-2:]))
                if len(config.stack) > 1 and len(vmwe.tokens) == pairSize:
                    penalty += 1
                continue
            if not topInside and not secondInside:
                continue

            # Exactly one of the two top elements belongs to the VMWE here.
            deeperHit = any(tok.In(vmwe)
                            for element in config.stack[:-2]
                            for tok in Sentence.getTokens(element))
            if deeperHit or any(b.In(vmwe) for b in config.buffer):
                penalty += 1
        return penalty
Exemplo n.º 8
0
    def check(transition):
        """Return an applied merge transition for the two top stack elements, or None.

        Needs at least two stack elements.  When ``VMWE.getParents`` reports
        exactly one parent for the tokens of the two top elements, a typed
        merge is chosen from that parent's ``type`` ('id', 'ireflv', 'lvc',
        'vpc', case-insensitive; anything else, including a missing type,
        maps to MergeAsOTH).  When it reports several parents the sentence is
        appended to ``reports.annotationReport``.

        Otherwise, when ``VMWE.haveSameParents`` reports exactly one parent
        whose last token is the last token on the stack top, a WhiteMerge is
        applied.
        """
        config = transition.configuration
        sent = config.sent
        if len(config.stack) <= 1:
            return None

        topTokens = Sentence.getTokens(config.stack[-1])
        belowTokens = Sentence.getTokens(config.stack[-2])
        # TODO getParent MWE for White merge
        pairTokens = belowTokens + topTokens

        parents = VMWE.getParents(pairTokens)
        if parents and len(parents) > 1:
            reports.annotationReport += str(sent)
        elif parents and len(parents) == 1:
            mweType = parents[0].type
            mergeClass = MergeAsOTH
            if mweType is not None and mweType != '':
                typedMerges = {
                    'id': MergeAsID,
                    'ireflv': MergeAsIReflV,
                    'lvc': MergeAsLVC,
                    'vpc': MergeAsVPC,
                }
                mergeClass = typedMerges.get(mweType.lower(), MergeAsOTH)
            merge = mergeClass(sent=sent)
            merge.apply(transition, sent=sent)
            return merge

        sameParents = VMWE.haveSameParents(pairTokens)
        if sameParents and len(sameParents) == 1 \
                and sameParents[0].tokens[-1] == pairTokens[-1]:
            merge = WhiteMerge(sent=sent)
            merge.apply(transition, sent)
            return merge

        return None
Exemplo n.º 9
0
 def check(transition):
     """Return an applied Merge transition for the current stack, or None.

     A Merge is produced when the stack holds more than one element and
     ``VMWE.getParents`` reports, for the tokens of the whole stack, exactly
     one parent that is neither embedded nor interleaving.
     """
     config = transition.configuration
     sent = config.sent
     # A merge needs at least two stack elements.
     if len(config.stack) <= 1:
         return None

     parents = VMWE.getParents(Sentence.getTokens(config.stack))
     if not parents or len(parents) != 1:
         return None

     onlyParent = parents[0]
     if onlyParent.isEmbedded or onlyParent.isInterleaving:
         return None

     merge = Merge(sent=sent)
     merge.apply(transition, sent)
     return merge
Exemplo n.º 10
0
    def getCost(config, transType=None):
        """Return a cost for the state `config` from its two top stack elements.

        Gold VMWEs that are interleaving or already identified are skipped.
        As soon as one remaining VMWE contains every token of both top stack
        elements, 0 is returned; each VMWE examined before that adds one.
        When none matches, the result is the number of VMWEs examined.

        `transType` is not used.
        """
        sent = config.sent
        misses = 0
        for vmwe in sent.vMWEs:
            if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
                continue

            topInside = all(tok.In(vmwe)
                            for tok in Sentence.getTokens(config.stack[-1]))
            belowInside = all(tok.In(vmwe)
                              for tok in Sentence.getTokens(config.stack[-2]))

            if topInside and belowInside:
                return 0
            misses += 1
        return misses
Exemplo n.º 11
0
    def getCost(config, transType=None, type=None):
        """Return a cost for the state `config` for a VMWE of category `type`.

        Gold VMWEs that are interleaving or already identified are skipped.
        If the stack top belongs to a single-token VMWE whose type equals
        `type` (case-insensitive), 0 is returned at once.  Every multi-token
        VMWE containing the stack top adds one.  One more is added when
        ``VMWE.getParents`` finds no parent of the given type for the tokens
        of the stack top.

        type -- category name; ``type.lower()`` is always evaluated, so the
                default of None cannot actually be used.
        `transType` is not used.
        """
        sent = config.sent
        cost = 0
        for vmwe in sent.vMWEs:
            if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
                continue
            if not (config.stack[-1]).In(vmwe):
                continue
            size = len(vmwe.tokens)
            if size == 1 and vmwe.type.lower() == type.lower():
                return 0
            if size > 1:
                cost += 1

        # Precision score:
        typedParents = VMWE.getParents(Sentence.getTokens(config.stack[-1]), type.lower())
        if not typedParents:
            cost += 1
        return cost
Exemplo n.º 12
0
 def concatenateTokens(tokens):
     """Return a flat list of new Token objects built from `tokens`.

     A Token element is copied into a new Token with index -1 and the same
     text, lemma and POS tag.  A list element becomes one Token with index -1
     whose text, lemma and POS tag are those of its tokens (as returned by
     ``Sentence.getTokens``) joined with '_'.  Elements of any other kind are
     left out.
     """
     merged = []
     for element in tokens:
         if isinstance(element, Token):
             merged.append(Token(-1, element.text, element.lemma, element.posTag))
         elif isinstance(element, list):
             parts = Sentence.getTokens(element)
             combined = Token(-1, '', '', '')
             combined.text = '_'.join(part.text for part in parts)
             combined.lemma = '_'.join(part.lemma for part in parts)
             combined.posTag = '_'.join(part.posTag for part in parts)
             merged.append(combined)
     return merged
Exemplo n.º 13
0
    def check(parent):
        """Return an applied Reduce transition when the stack top can be dropped, else None.

        A Reduce is applied to `parent` when the stack is non-empty and any
        of these holds, tested in this order:

        1. the top is a Token without parent MWEs;
        2. the buffer is empty;
        3. the top is a one-element list whose element has exactly one
           parent MWE;
        4. the top is a list whose tokens have exactly one parent according
           to ``VMWE.getParents``, and that parent is not embedded;
        5. the sentence contains interleaving MWEs and the top is a Token
           with exactly one parent MWE, which is interleaving.

        Rule 3 now tests ``len(parentMWEs) == 1``; the former comparison of
        the list itself with 1 could never be true.  Rule 4 tolerates an
        empty or None result from ``VMWE.getParents``.
        """
        config = parent.configuration
        sent = config.sent

        reduceTransition = Reduce(sent=sent)

        def applied():
            # Apply the Reduce to `parent` and hand it back to the caller.
            reduceTransition.apply(parent, sent)
            return reduceTransition

        # Every rule below needs a stack top.
        if not config.stack:
            return None

        top = config.stack[-1]
        topIsToken = isinstance(top, Token)
        topIsList = isinstance(top, list)

        if topIsToken and not top.parentMWEs:
            return applied()

        if not config.buffer:
            return applied()

        if topIsList and len(top) == 1:
            soleParents = top[0].parentMWEs
            if soleParents and len(soleParents) == 1:
                return applied()

        if topIsList:
            sharedParents = VMWE.getParents(Sentence.getTokens(top))
            if sharedParents and len(sharedParents) == 1 and not sharedParents[0].isEmbedded:
                return applied()

        if topIsToken and sent.containsInterleaving:
            tokenParents = top.parentMWEs
            if tokenParents and len(tokenParents) == 1 and tokenParents[0].isInterleaving:
                return applied()

        return None
Exemplo n.º 14
0
 def getCost(config, transType=None):
     """Return a cost for a typed transition over the two top stack elements.

     Gold VMWEs that are interleaving or already identified are skipped.  For
     each remaining VMWE:

     * a single-token VMWE whose token is the Token on top costs one;
     * exactly one of the two top elements lying inside the VMWE costs one;
     * both lying inside it returns 0 at once when together they hold as many
       tokens as the VMWE and ``transType.name[7:]`` equals the VMWE type
       (case-insensitive); otherwise one is added when another token of the
       VMWE is deeper in the stack or in the buffer.

     Finally one is added unless some parent reported by ``VMWE.getParents``
     for the two top elements has a type contained in the transition name.

     transType -- transition type; its ``name`` is read, so None only works
                  on paths that never reach those reads.
     """
     sent = config.sent
     cost = 0
     for vmwe in sent.vMWEs:
         if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
             continue

         top = config.stack[-1]
         topTokens = Sentence.getTokens(top)
         topInside = all(tok.In(vmwe) for tok in topTokens)
         if topInside and len(vmwe.tokens) == 1 and isinstance(top, Token):
             cost += 1
             continue

         belowTokens = Sentence.getTokens(config.stack[-2])
         belowInside = all(tok.In(vmwe) for tok in belowTokens)

         # Only one of the two elements belongs to the VMWE.
         if topInside != belowInside:
             cost += 1
             continue
         # Neither belongs to it: this VMWE is irrelevant here.
         if not topInside:
             continue

         if (len(topTokens) + len(belowTokens) == len(vmwe.tokens)
                 and transType.name[7:].lower() == vmwe.type.lower()):
             return 0

         deeperHit = any(tok.In(vmwe)
                         for element in config.stack[:-2]
                         for tok in Sentence.getTokens(element))
         if deeperHit or any(b.In(vmwe) for b in config.buffer):
             cost += 1

     # Precision score:
     parents = VMWE.getParents(Sentence.getTokens(config.stack[-2:]))
     typeMatches = bool(parents) and any(
         candidate.type.lower() in str(transType.name).lower() for candidate in parents)
     if not typeMatches:
         cost += 1
     return cost
Exemplo n.º 15
0
    def apply(self, parent, sent, vMWEId=None, parse=False):
        """Initialise this transition from `parent` by completing a stack element.

        The tokens are taken from the first stack element while the last
        stack element is removed from the new stack.  More than one token
        yields a new VMWE appended to the new token list; a single token is
        appended as is.

        parent -- previous transition; its ``configuration`` is copied.
        sent   -- sentence; when a VMWE is built, its ``blackMergeNum`` is
                  incremented if `sent` is not None and `parse` is false, and
                  it receives the VMWE in ``identifiedVMWEs`` if `parse` is
                  true.
        vMWEId -- id of the new VMWE; defaults to
                  ``VMWE.getVMWENumber(tokens) + 1``.
        """
        Counters.completeNum += 1
        source = parent.configuration
        remainingBuffer = list(source.buffer)
        fullStack = list(source.stack)
        completedTokens = Sentence.getTokens(fullStack[0])
        remainingStack = fullStack[:-1]
        producedTokens = list(source.tokens)

        tokenCount = len(completedTokens)
        if tokenCount == 1:
            producedTokens.append(completedTokens[0])
        elif tokenCount > 1:
            if sent is not None and not parse:
                sent.blackMergeNum += 1

            mweId = vMWEId if vMWEId is not None else VMWE.getVMWENumber(producedTokens) + 1
            vMWE = VMWE(mweId, completedTokens[0])
            if parse:
                sent.identifiedVMWEs.append(vMWE)
            vMWE.tokens = completedTokens
            producedTokens.append(vMWE)

        super(Complete, self).__init__(
            config=Configuration(stack=remainingStack, buffer=remainingBuffer,
                                 tokens=producedTokens, sent=sent, transition=self),
            previous=parent, sent=sent)
Exemplo n.º 16
0
 def areInLexic(tokensList):
     """Return True when the lemma string of `tokensList` is a key of ``Corpus.mweDictionary``."""
     lemmaKey = Sentence.getTokenLemmas(tokensList)
     return lemmaKey in Corpus.mweDictionary
Exemplo n.º 17
0
 def loadFromSummitCorpus(self, dir):
     """Read the files inside `dir` and load the semantic and syntactic information.

     The base name is the part of `dir` after its last '/'.  Four files are
     parsed with minidom: ``<base>.rs3``, ``<base>.txt.words.xml``,
     ``<base>.txt.pos.xml`` and ``<base>.txt.chunks.xml``.  Every ``segment``
     of the RST body becomes a Sentence appended to ``self.sentences``; its
     tokens are aligned with the entries of the words file, and each aligned
     entry becomes a word appended to ``sentence.words``.  Finally
     ``self.parse_tree`` and ``self.rst_tree`` are built.

     If the RST file cannot be parsed, its path is printed and the parser's
     exception is re-raised.  Previously the error was swallowed and the
     method failed later on the unbound ``rst_file``.
     """
     base_name = dir[dir.rfind('/') + 1:]
     pos_file_addr = dir + '/' + base_name + '.txt.pos.xml'
     word_file_addr = dir + '/' + base_name + '.txt.words.xml'
     chunks_file_addr = dir + '/' + base_name + '.txt.chunks.xml'
     rst_file_addr = dir + '/' + base_name + '.rs3'

     # Reads rst_file, word_file and pos_file to build the data structure.
     try:
         rst_file = minidom.parse(rst_file_addr)
     except Exception:
         # Report which file failed, then let the real error propagate.
         print(rst_file_addr)
         raise
     word_file = minidom.parse(word_file_addr)
     pos_file = minidom.parse(pos_file_addr)
     chunks_file = minidom.parse(chunks_file_addr)

     chunks = Discourse.getSyntFromChunks(chunks_file)

     sentences = rst_file.getElementsByTagName('body')[0].getElementsByTagName('segment')
     words = word_file.getElementsByTagName('words')[0].getElementsByTagName('word')
     words_pos = pos_file.getElementsByTagName('words')[0].getElementsByTagName('word')

     def build_word(word_node, text):
         # Build a word for an entry of the words file: load it from the POS
         # entry with the same id (or from None when there is none), then
         # copy the referent, text, id and chunk information onto it.
         word_id = word_node.attributes['id'].value
         el = findDOMElementById(words_pos, word_id)
         if el != False:
             word = Discourse.loadWordFromPosFile(el)
         else:
             word = Discourse.loadWordFromPosFile(None)

         # Stores the referent (if there is one).
         if word_node.attributes.has_key('ref'):
             word.properties['ref'] = word_node.attributes['ref'].value

         word.properties['text'] = text
         word.properties['id'] = word_id

         if chunks.has_key(word_id):
             word.properties['synt'] = chunks[word_id]
         return word

     # Position in `words`; it keeps advancing across sentences.
     # NOTE(review): words[i] is indexed without a bounds check.
     i = 0
     # j counts the sentences.
     for j, tmp_sen in enumerate(sentences):
         sentence = Sentence()
         sentence.index = j
         # Set RST_Node id and its related node id.
         if tmp_sen.attributes.has_key('id'):
             sentence.id = tmp_sen.attributes['id'].value
         if tmp_sen.attributes.has_key('parent'):
             sentence.rel_id = tmp_sen.attributes['parent'].value
         if tmp_sen.attributes.has_key('relname'):
             sentence.rel_name = tmp_sen.attributes['relname'].value

         sen_tokens = extractIgnoredChars(extractSpecialHTMLChars(tmp_sen.childNodes[0].toxml())).lower().split(' ')
         sen_tokens = expand_Contractions(sen_tokens)

         word_to_compare = ''
         rebuild_contraction = False
         prep = ''
         # Not all of the representation forms are either known or easy to
         # handle, so there is a kind of "tolerance limit": if a word in the
         # RST file does not match its counterpart in the words file, the
         # loader can still jump to the following word until the tolerance
         # is reached.
         mismatch_tolerance = 3

         for tmp_word in sen_tokens:

             if tmp_word == ' ' or tmp_word == '':
                 continue

             # Ignores special characters.
             if words[i].childNodes[0].toxml() in TOK_IGNORED_CHARS:
                 i += 1

             # A contraction may be inside a noun phrase. If so, it has to
             # be rebuilt before comparing.
             if rebuild_contraction:
                 # pos_underscore was set in the iteration that raised
                 # rebuild_contraction.
                 cont = isContractionPair(prep, tmp_word)
                 if cont != False:
                     if pos_underscore < 0:
                         word_to_compare = cont
                     else:
                         word_to_compare = word_to_compare[0:pos_underscore] + '_' + cont
                 else:
                     word_to_compare += '_' + tmp_word
                 rebuild_contraction = False
             else:
                 word_to_compare += tmp_word

             # If the word inside the sentence is equal to the word in the
             # words file, then add it to the sentence structure.
             tmp_word2 = replaceChars(words[i].childNodes[0].toxml().lower(),
                                      [',', '.', '?', '"', '!', '=', ':', '-', '\'', '\n', ';'], '')
             if word_to_compare == tmp_word2:
                 word = build_word(words[i], tmp_word2)
                 sentence.words.append(word)
                 word.sentence = sentence
                 i += 1
                 word_to_compare = ''
             else:
                 # May be a contraction.
                 pos_underscore = word_to_compare.rfind('_')
                 prep = word_to_compare[pos_underscore + 1:]
                 is_cont = isContractionPrep(prep)

                 # It may be some noun phrase ...
                 if word_to_compare in tmp_word2 or is_cont:
                     if is_cont:
                         rebuild_contraction = True
                     else:
                         word_to_compare += '_'
                 elif mismatch_tolerance > 0:
                     # Give up on this token: store the words-file entry as
                     # is and move on, spending one unit of tolerance.
                     mismatch_tolerance -= 1
                     word = build_word(words[i], tmp_word2)
                     sentence.words.append(word)
                     word.sentence = sentence
                     i += 1
                     word_to_compare = ''

         self.sentences.append(sentence)

     self.parse_tree = Tree()
     self.parse_tree.loadFromSumitCorpus(chunks_file, self)

     self.rst_tree = RST_Tree(self)
     self.rst_tree.loadFromSumitCorpus(rst_file, self)