def getCost(config, transType=None):
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        s0BelongsToVmwe = True
        increaseCost = False
        for token in Sentence.getTokens(config.stack[-1]):
            if not token.In(vmwe):
                s0BelongsToVmwe = False
        if len(vmwe.tokens) == 1 and s0BelongsToVmwe:
            if isinstance(config.stack[-1], Token):
                cost += 1
                continue
            elif isinstance(config.stack[-1], list) and len(config.stack[-1]) == 1:
                if len(config.stack[-1][0].parentMWEs) == 1:
                    return 0
                else:
                    cost += 1
        if not s0BelongsToVmwe:
            continue
        for s in config.stack[:-1]:
            for token in Sentence.getTokens(s):
                if token.In(vmwe):
                    increaseCost = True
                    break
        if increaseCost:
            cost += 1
        else:
            for b in config.buffer:
                if b.In(vmwe):
                    cost += 1
                    break
    return cost
def getCost(config, transType=None):
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        b0BelongsToVmwe = config.buffer[0].In(vmwe)
        if not b0BelongsToVmwe:
            continue
        s0BelongsToVmwe = True
        if len(config.stack) > 0:
            for token in Sentence.getTokens(config.stack[-1]):
                if not token.In(vmwe):
                    s0BelongsToVmwe = False
            if len(config.stack) > 1:
                otherStackElementsBelongtoVmwe = False
                for sElem in config.stack[:-1]:
                    for token in Sentence.getTokens(sElem):
                        if token.In(vmwe):
                            otherStackElementsBelongtoVmwe = True
                            break
                    if otherStackElementsBelongtoVmwe:
                        break
            else:
                otherStackElementsBelongtoVmwe = False
        else:
            s0BelongsToVmwe = False
            otherStackElementsBelongtoVmwe = False
        if not s0BelongsToVmwe and otherStackElementsBelongtoVmwe:
            cost += 1
    return cost
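# Illustrative sketch, not part of the original system: each getCost function in this listing
# scores one candidate transition against the gold VMWEs of the sentence. A cost-based oracle
# would typically compute the cost of every legal transition in the current configuration and
# apply the cheapest one. The helper below is hypothetical and only shows that selection step;
# the transition labels and costs in the usage example are made up.
def pickCheapestTransition(costByTransition):
    # costByTransition maps a transition label to the integer cost returned by its getCost.
    return min(costByTransition, key=costByTransition.get)

# Example: pickCheapestTransition({'SHIFT': 1, 'REDUCE': 0, 'MERGE': 2}) returns 'REDUCE'.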
def generateDisconinousFeatures(configuration, sent, transDic):
    tokens = Sentence.getTokens([configuration.stack[-1]])
    tokenTxt = Sentence.getTokenLemmas(tokens)
    for key in Corpus.mweDictionary.keys():
        if tokenTxt in key and tokenTxt != key:
            bufidx = 0
            for bufElem in configuration.buffer[:5]:
                if bufElem.lemma != '' and (
                        (tokenTxt + ' ' + bufElem.lemma) in key or (bufElem.lemma + ' ' + tokenTxt) in key):
                    transDic['S0B' + str(bufidx) + 'ArePartsOfMWE'] = True
                    transDic['S0B' + str(bufidx) + 'ArePartsOfMWEDistance'] = \
                        sent.tokens.index(bufElem) - sent.tokens.index(tokens[-1])
                bufidx += 1
            break
def apply(self, parent, sent, vMWEId=None, parse=False, vMWEType=None, mwtMerge=False):
    Counters.blackMergeNum += 1
    if sent and not parse:
        sent.blackMergeNum += 1
    config = parent.configuration
    newBuffer = list(config.buffer)
    if mwtMerge:
        newStack = list(config.stack)[:-1]
        newStack.append([config.stack[-1]])
    else:
        newStack = list(config.stack)[:-2]
        newStack.append([config.stack[-2], config.stack[-1]])
    newTokens = list(config.tokens)
    vMWETokens = Sentence.getTokens(newStack[-1])
    if len(vMWETokens) > 1 or (len(vMWETokens) == 1 and mwtMerge):
        if vMWEId is None:
            vMWEId = VMWE.getVMWENumber(newTokens) + 1
        vMWE = VMWE(vMWEId, vMWETokens[0])
        if parse:
            sent.identifiedVMWEs.append(vMWE)
        vMWE.tokens = vMWETokens
        if vMWEType is not None:
            vMWE.type = vMWEType
        newTokens.append(vMWE)
    elif len(vMWETokens) == 1:
        newTokens.append(vMWETokens[0])
    newConfig = Configuration(stack=newStack, buffer=newBuffer, tokens=newTokens, sent=sent, transition=self)
    super(BlackMerge, self).__init__(config=newConfig, previous=parent, sent=sent)
def getCost(config, transType=None):
    sent = config.sent
    cost = 0
    if isinstance(config.stack[-1], list):
        for vmwe in sent.vMWEs:
            if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
                continue
            allInList, someInList = True, False
            stackTokens = Sentence.getTokens(config.stack[-1])
            for t in stackTokens:
                if not t.In(vmwe):
                    allInList = False
                else:
                    someInList = True
            if len(stackTokens) == len(vmwe.tokens) and allInList:
                return 0
            else:
                allInList = False
            if not allInList and someInList:
                cost += 1
    elif isinstance(config.stack[-1], Token):
        if config.stack[-1].parentMWEs is None or len(config.stack[-1].parentMWEs) == 0:
            return 0
        else:
            for vmwe in config.stack[-1].parentMWEs:
                if config.stack[-1].In(vmwe) and len(vmwe.tokens) > 1:
                    cost += 1
            return cost
    return cost
def checkForVMWE(transition):
    config = transition.configuration
    sent = config.sent
    # Check up for a possible COMPLETE of MWE after a MERGE transition
    if transition.type == TransitionType.MERGE:
        if len(config.stack) == 1 and isinstance(config.stack[0], list):
            vMWE = None
            parents = []
            tokens = Sentence.getTokens(config.stack[0])
            for token in tokens:
                if len(token.parentMWEs) == 1:
                    vMWE = token.parentMWEs[0]
                    break
                for parent in token.parentMWEs:
                    if parent not in parents:
                        parents.append(parent)
            if vMWE is None:
                for parent in parents:
                    for token in tokens:
                        if parent not in token.parentMWEs:
                            parents.remove(parent)
                if len(parents) > 1:
                    for parent in parents:
                        if parent.isInterleaving or parent.isEmbedded:
                            parents.remove(parent)
                vMWE = parents[0]
            if vMWE is not None and len(vMWE.tokens) == len(tokens):
                complete = Complete(sent=sent)
                complete.apply(transition, sent, vMWEId=vMWE.id)
                return complete
    return None
def getCost(config, transType=None):
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        s0BelongsToVmwe = True
        s1BelongsToVmwe = True
        increaseCost = False
        for token in Sentence.getTokens(config.stack[-1]):
            if not token.In(vmwe):
                s0BelongsToVmwe = False
        if len(vmwe.tokens) == 1 and s0BelongsToVmwe and isinstance(config.stack[-1], Token):
            cost += 1
            continue
        for token in Sentence.getTokens(config.stack[-2]):
            if not token.In(vmwe):
                s1BelongsToVmwe = False
        if (s0BelongsToVmwe and s1BelongsToVmwe) and len(config.stack) > 1 and \
                len(vmwe.tokens) == len(Sentence.getTokens(config.stack[-2:])):
            cost += 1
            continue
        if (s0BelongsToVmwe and s1BelongsToVmwe) or (not s0BelongsToVmwe and not s1BelongsToVmwe):
            continue
        if len(config.stack) > 2:
            for stackElement in config.stack[:-2]:
                for token in Sentence.getTokens(stackElement):
                    if token.In(vmwe):
                        increaseCost = True
                        break
                if increaseCost:
                    break
        if increaseCost:
            cost += 1
            continue
        for b in config.buffer:
            if b.In(vmwe):
                cost += 1
                break
    return cost
def check(transition):
    config = transition.configuration
    sent = config.sent
    # Check up of a possible MERGE
    if len(config.stack) > 1:
        s0Tokens = Sentence.getTokens(config.stack[-1])
        s1Tokens = Sentence.getTokens(config.stack[-2])
        # TODO: getParent MWE for White merge
        tokens = s1Tokens + s0Tokens
        # tokens = Sentence.getTokens(config.stack)
        selectedParents = VMWE.getParents(tokens)
        if selectedParents and len(selectedParents) > 1:
            reports.annotationReport += str(sent)
        if selectedParents and len(selectedParents) == 1:
            selectedParent = selectedParents[0]
            if selectedParent.type is not None and selectedParent.type != '':
                if selectedParent.type.lower() == 'id':
                    merge = MergeAsID(sent=sent)
                elif selectedParent.type.lower() == 'ireflv':
                    merge = MergeAsIReflV(sent=sent)
                elif selectedParent.type.lower() == 'lvc':
                    merge = MergeAsLVC(sent=sent)
                elif selectedParent.type.lower() == 'vpc':
                    merge = MergeAsVPC(sent=sent)
                else:
                    merge = MergeAsOTH(sent=sent)
            else:
                merge = MergeAsOTH(sent=sent)
            merge.apply(transition, sent=sent)
            return merge
        # selectedParents = VMWE.getSharedVMWEs(Sentence.getTokens(config.stack))
        # if selectedParents and len(selectedParents) > 1:
        #     reports.annotationReport += str(sent)
        selectedParents = VMWE.haveSameParents(tokens)
        if selectedParents and len(selectedParents) == 1:
            if selectedParents[0].tokens[-1] == tokens[-1]:
                # if len(config.stack) > 2:
                merge = WhiteMerge(sent=sent)
                merge.apply(transition, sent)
                return merge
    return None
def check(transition):
    config = transition.configuration
    sent = config.sent
    # Check up of a possible MERGE
    if len(config.stack) > 1:
        tokens = Sentence.getTokens(config.stack)
        selectedParents = VMWE.getParents(tokens)
        if selectedParents and len(selectedParents) == 1 and not selectedParents[0].isEmbedded \
                and not selectedParents[0].isInterleaving:
            merge = Merge(sent=sent)
            merge.apply(transition, sent)
            return merge
    return None
def getCost(config, transType=None):
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        s0BelongsToVmwe = True
        s1BelongsToVmwe = True
        for token in Sentence.getTokens(config.stack[-1]):
            if not token.In(vmwe):
                s0BelongsToVmwe = False
                break
        for token in Sentence.getTokens(config.stack[-2]):
            if not token.In(vmwe):
                s1BelongsToVmwe = False
                break
        if s0BelongsToVmwe and s1BelongsToVmwe:
            return 0
        else:
            cost += 1
    return cost
def getCost(config, transType=None, type=None):
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        if config.stack[-1].In(vmwe) and len(vmwe.tokens) == 1 and vmwe.type.lower() == type.lower():
            return 0
        if config.stack[-1].In(vmwe) and len(vmwe.tokens) > 1:
            cost += 1
            continue
    # Precision score:
    vmwes = VMWE.getParents(Sentence.getTokens(config.stack[-1]), type.lower())
    if not vmwes:
        cost += 1
    return cost
def concatenateTokens(tokens):
    idx = 0
    tokenDic = {}
    result = []
    for token in tokens:
        if isinstance(token, Token):
            result.append(Token(-1, token.text, token.lemma, token.posTag))
        elif isinstance(token, list):
            tokenDic[idx] = Token(-1, '', '', '')
            for subToken in Sentence.getTokens(token):
                tokenDic[idx].text += subToken.text + '_'
                tokenDic[idx].lemma += subToken.lemma + '_'
                tokenDic[idx].posTag += subToken.posTag + '_'
            tokenDic[idx].text = tokenDic[idx].text[:-1]
            tokenDic[idx].lemma = tokenDic[idx].lemma[:-1]
            tokenDic[idx].posTag = tokenDic[idx].posTag[:-1]
            result.append(tokenDic[idx])
        idx += 1
    return result
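# Illustrative example of the concatenation above, using a minimal stand-in for the project's
# Token class (concatenateTokens itself depends on Token and Sentence.getTokens, which are not
# shown here). It reproduces the underscore-joining of the text, lemma and POS fields of a
# merged stack element; the sample tokens are invented.
class _ToyToken(object):
    def __init__(self, text, lemma, posTag):
        self.text, self.lemma, self.posTag = text, lemma, posTag

def _concatToy(subTokens):
    # Join the fields of grouped sub-tokens with '_', as concatenateTokens does for list elements.
    return _ToyToken('_'.join(t.text for t in subTokens),
                     '_'.join(t.lemma for t in subTokens),
                     '_'.join(t.posTag for t in subTokens))

# _concatToy([_ToyToken('faire', 'faire', 'V'), _ToyToken('partie', 'partie', 'N')])
# yields text='faire_partie', lemma='faire_partie', posTag='V_N'.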
def check(parent):
    config = parent.configuration
    sent = config.sent
    reduce = Reduce(sent=sent)
    stackWithTopTokenWitoutParents = config.stack and isinstance(config.stack[-1], Token) and (
        not config.stack[-1].parentMWEs)
    if stackWithTopTokenWitoutParents:
        reduce.apply(parent, sent)
        return reduce
    empyBufferWithFullStack = not config.buffer and config.stack
    if empyBufferWithFullStack:
        reduce.apply(parent, sent)
        return reduce
    stackWithMWT = config.stack and isinstance(config.stack[-1], list) and len(config.stack[-1]) == 1 and \
        len(config.stack[-1][0].parentMWEs) == 1
    if stackWithMWT:
        reduce.apply(parent, sent)
        return reduce
    stackWithSingleListWitOneSharedParentOnly = False
    if config.stack and isinstance(config.stack[-1], list):
        tokens = Sentence.getTokens(config.stack[-1])
        if len(VMWE.getParents(tokens)) == 1 and not VMWE.getParents(tokens)[0].isEmbedded:
            stackWithSingleListWitOneSharedParentOnly = True
    if stackWithSingleListWitOneSharedParentOnly:
        reduce.apply(parent, sent)
        return reduce
    stackWithTopTokenOfInterleavingMWE = sent.containsInterleaving and config.stack and \
        isinstance(config.stack[-1], Token) and (
            config.stack[-1].parentMWEs and len(config.stack[-1].parentMWEs) == 1 and
            config.stack[-1].parentMWEs[0].isInterleaving)
    if stackWithTopTokenOfInterleavingMWE:
        reduce.apply(parent, sent)
        return reduce
    return None
def getCost(config, transType=None):
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        s0BelongsToVmwe = True
        s1BelongsToVmwe = True
        increaseCost = False
        for token in Sentence.getTokens(config.stack[-1]):
            if not token.In(vmwe):
                s0BelongsToVmwe = False
        if len(vmwe.tokens) == 1 and s0BelongsToVmwe and isinstance(config.stack[-1], Token):
            cost += 1
            continue
        for token in Sentence.getTokens(config.stack[-2]):
            if not token.In(vmwe):
                s1BelongsToVmwe = False
        if (s0BelongsToVmwe and not s1BelongsToVmwe) or (not s0BelongsToVmwe and s1BelongsToVmwe):
            cost += 1
            continue
        if s0BelongsToVmwe and s1BelongsToVmwe:
            if len(Sentence.getTokens(config.stack[-1])) + len(Sentence.getTokens(config.stack[-2])) == \
                    len(vmwe.tokens) and transType.name[7:].lower() == vmwe.type.lower():
                return 0
        if len(config.stack) > 2:
            for stackElement in config.stack[:-2]:
                for token in Sentence.getTokens(stackElement):
                    if token.In(vmwe):
                        increaseCost = True
                        break
                if increaseCost:
                    break
        if increaseCost:
            cost += 1
            continue
        for b in config.buffer:
            if b.In(vmwe):
                cost += 1
                break
    # Precision score:
    correctlyIdentified = False
    vmwes = VMWE.getParents(Sentence.getTokens(config.stack[-2:]))
    if vmwes:
        for vmwe in vmwes:
            if vmwe.type.lower() in str(transType.name).lower():
                correctlyIdentified = True
    if not correctlyIdentified:
        cost += 1
    return cost
def apply(self, parent, sent, vMWEId=None, parse=False):
    Counters.completeNum += 1
    config = parent.configuration
    newBuffer = list(config.buffer)
    newStack = list(config.stack)
    vMWETokens = Sentence.getTokens(newStack[0])
    newStack = newStack[:-1]
    newTokens = list(config.tokens)
    if len(vMWETokens) > 1:
        if sent is not None and not parse:
            sent.blackMergeNum += 1
        if vMWEId is None:
            vMWEId = VMWE.getVMWENumber(newTokens) + 1
        vMWE = VMWE(vMWEId, vMWETokens[0])
        if parse:
            sent.identifiedVMWEs.append(vMWE)
        vMWE.tokens = vMWETokens
        newTokens.append(vMWE)
    elif len(vMWETokens) == 1:
        newTokens.append(vMWETokens[0])
    newConfig = Configuration(stack=newStack, buffer=newBuffer, tokens=newTokens, sent=sent, transition=self)
    super(Complete, self).__init__(config=newConfig, previous=parent, sent=sent)
def areInLexic(tokensList):
    if Sentence.getTokenLemmas(tokensList) in Corpus.mweDictionary.keys():
        return True
    return False
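# Minimal sketch of the lexicon check performed by areInLexic, assuming, as the feature code
# above suggests, that Corpus.mweDictionary is keyed by space-separated lemma strings. The
# helper name and the sample dictionary are hypothetical.
def _areInToyLexic(lemmas, mweDictionary):
    # True when the joined lemma sequence is an entry of the MWE dictionary.
    return ' '.join(lemmas) in mweDictionary

# Example: _areInToyLexic(['faire', 'partie'], {'faire partie': 2, 'avoir lieu': 3}) returns True.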
def loadFromSummitCorpus(self, dir):
    ''' Reads the several files inside dir and loads the semantic and syntactic information '''
    base_name = dir[dir.rfind('/') + 1:len(dir)]
    pos_file_addr = dir + '/' + base_name + '.txt.pos.xml'
    word_file_addr = dir + '/' + base_name + '.txt.words.xml'
    chunks_file_addr = dir + '/' + base_name + '.txt.chunks.xml'
    rst_file_addr = dir + '/' + base_name + '.rs3'
    ''' Reads the rst_file, word_file and pos_file to build the data structure '''
    try:
        rst_file = minidom.parse(rst_file_addr)
    except:
        print(rst_file_addr)
    word_file = minidom.parse(word_file_addr)
    pos_file = minidom.parse(pos_file_addr)
    chunks_file = minidom.parse(chunks_file_addr)
    chunks = Discourse.getSyntFromChunks(chunks_file)
    sentences = rst_file.getElementsByTagName('body')[0].getElementsByTagName('segment')
    words = word_file.getElementsByTagName('words')[0].getElementsByTagName('word')
    words_pos = pos_file.getElementsByTagName('words')[0].getElementsByTagName('word')
    i = 0
    ''' Counter for the sentences '''
    j = 0
    for tmp_sen in sentences:
        sentence = Sentence()
        sentence.index = j
        ''' Set the RST_Node id and its related node id '''
        if tmp_sen.attributes.has_key('id'):
            sentence.id = tmp_sen.attributes['id'].value
        if tmp_sen.attributes.has_key('parent'):
            sentence.rel_id = tmp_sen.attributes['parent'].value
        if tmp_sen.attributes.has_key('relname'):
            sentence.rel_name = tmp_sen.attributes['relname'].value
        sen_tokens = extractIgnoredChars(
            extractSpecialHTMLChars(tmp_sen.childNodes[0].toxml())).lower().split(' ')
        sen_tokens = expand_Contractions(sen_tokens)
        word_to_compare = ''
        rebuild_contraction = False
        prep = ''
        ''' Not all of the representation forms are known or easy to handle, so we implemented a kind of
        "tolerance limit": if a word in the RST file does not match its respective entry in the words file,
        we can still jump to the following word until the tolerance is reached. '''
        mismatch_tolerance = 3
        for tmp_word in sen_tokens:
            if tmp_word == ' ' or tmp_word == '':
                continue
            ''' Ignores special characters '''
            if words[i].childNodes[0].toxml() in TOK_IGNORED_CHARS:
                i += 1
            ''' A contraction may be inside a noun phrase. If so, it will be necessary to rebuild it
            before comparing ... '''
            if rebuild_contraction:
                cont = isContractionPair(prep, tmp_word)
                if cont != False:
                    if pos_underscore < 0:
                        word_to_compare = cont
                    else:
                        word_to_compare = word_to_compare[0:pos_underscore] + '_' + cont
                else:
                    word_to_compare += '_' + tmp_word
                rebuild_contraction = False
            else:
                word_to_compare += tmp_word
            ''' If the word inside the sentence is equal to the word in the words file,
            then add it to the sentence structure '''
            tmp_word2 = replaceChars(words[i].childNodes[0].toxml().lower(),
                                     [',', '.', '?', '"', '!', '=', ':', '-', '\'', '\n', ';'], '')
            if word_to_compare == tmp_word2:
                el = findDOMElementById(words_pos, words[i].attributes['id'].value)
                if el != False:
                    word = Discourse.loadWordFromPosFile(el)
                else:
                    word = Discourse.loadWordFromPosFile(None)
                if words[i].attributes.has_key('ref'):
                    word.properties['ref'] = words[i].attributes['ref']
                word.properties['text'] = tmp_word2
                word.properties['id'] = words[i].attributes['id'].value
                if chunks.has_key(words[i].attributes['id'].value):
                    word.properties['synt'] = chunks[words[i].attributes['id'].value]
                ''' Stores the referent (if there is one) '''
                if words[i].attributes.has_key('ref'):
                    word.properties['ref'] = words[i].attributes['ref'].value
                sentence.words.append(word)
                word.sentence = sentence
                i += 1
                word_to_compare = ''
            else:
                ''' May be a contraction '''
                pos_underscore = word_to_compare.rfind('_')
                prep = word_to_compare[pos_underscore + 1:len(word_to_compare)]
                is_cont = isContractionPrep(prep)
                ''' It may be some noun phrase ... '''
                if word_to_compare in tmp_word2 or is_cont:
                    ''' May be a contraction '''
                    if is_cont:
                        rebuild_contraction = True
                    else:
                        word_to_compare += '_'
                else:
                    if mismatch_tolerance > 0:
                        mismatch_tolerance -= 1
                        el = findDOMElementById(words_pos, words[i].attributes['id'].value)
                        if el != False:
                            word = Discourse.loadWordFromPosFile(el)
                        else:
                            word = Discourse.loadWordFromPosFile(None)
                        if words[i].attributes.has_key('ref'):
                            word.properties['ref'] = words[i].attributes['ref'].value
                        word.properties['text'] = tmp_word2
                        word.properties['id'] = words[i].attributes['id'].value
                        if chunks.has_key(words[i].attributes['id'].value):
                            word.properties['synt'] = chunks[words[i].attributes['id'].value]
                        sentence.words.append(word)
                        word.sentence = sentence
                        i += 1
                        word_to_compare = ''
        self.sentences.append(sentence)
        j += 1
    self.parse_tree = Tree()
    self.parse_tree.loadFromSumitCorpus(chunks_file, self)
    self.rst_tree = RST_Tree(self)
    self.rst_tree.loadFromSumitCorpus(rst_file, self)