def __init__(self, scenario, args):
    """\
    Constructor: initialize the base block, verify that a language
    has been set and create the lexicon.
    """
    Block.__init__(self, scenario, args)
    language_missing = self.language is None
    if language_missing:
        raise LoadingException('Language must be defined!')
    self.lexicon = Lexicon()
class ImposeSubjPredAgr(ImposeAgreement):
    """
    Impose gender, number and person agreement of finite verbs
    (predicates) with their subjects.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor, checking the argument values"
        super(ImposeSubjPredAgr, self).__init__(scenario, args)
        self.lexicon = Lexicon()

    def should_agree(self, tnode):
        """\
        Find finite verbs, with/without a subject.

        Returns False for anything whose formeme is not a finite verb;
        otherwise returns a pair (verb a-node, subject a-node). The subject
        is looked up among the effective children with the 'Sb' afun and
        may be None (this case is handled in process_excepts).
        """
        # avoid everything except finite verbs
        if not re.match(r'v.+(fin|rc)$', tnode.formeme):
            return False
        anode = tnode.lex_anode
        asubj = first(lambda achild: achild.afun == 'Sb',
                      anode.get_echildren())
        return (anode, asubj)

    def process_excepts(self, tnode, match_nodes):
        """\
        Handle the special cases of this rule: subjectless verbs, reflexive
        passives, incongruent numerals and indefinite pronoun subjects.

        Returns True if the case has been dealt with here (including verbs
        without any subject, which are just skipped), False otherwise.
        """
        anode, asubj = match_nodes
        # subjectless verbs, reflexive passive and
        # incongruent numerals: 3.ps. sg. neut.
        subjectless_special = asubj is None and (
            re.match(r'^((po|z|za)?dařit|(za)?líbit)$', anode.lemma) or
            (tnode.gram_diathesis or tnode.voice) in ['reflexive_diathesis',
                                                      'deagent'])
        if subjectless_special or \
                (asubj and self.lexicon.is_incongruent_numeral(asubj.lemma)):
            anode.morphcat_gender = 'N'
            anode.morphcat_number = 'S'
            anode.morphcat_person = '3'
            return True
        # This will skip all verbs without subject
        if asubj is None:
            return True
        # Indefinite pronoun subjects
        if re.match(r'^((ně|ni|)kdo|kdokoliv?)$', asubj.lemma):
            anode.morphcat_gender = 'M'
            anode.morphcat_number = asubj.morphcat_number or 'S'
            anode.morphcat_person = '3'
            return True
        return False

    def impose(self, tnode, match_nodes):
        "Impose the subject-predicate agreement on regular nodes."
        anode, asubj = match_nodes
        # Copy the categories from the subject to the predicate
        anode.morphcat_gender = asubj.morphcat_gender
        person = asubj.morphcat_person
        # anything else than an explicit 1st/2nd/3rd person defaults to 3rd
        anode.morphcat_person = person if person in ['1', '2', '3'] else '3'
        anode.morphcat_number = asubj.morphcat_number
        # Correct for coordinated subjects: the predicate goes to plural
        # (unless the conjunction is 'nebo'); the coordinated subject itself
        # keeps its own number.
        if asubj.is_member and asubj.parent.lemma != 'nebo':
            anode.morphcat_number = 'P'
class GeneratePossessiveAdjectives(Block):
    """\
    Replace the a-node lemma of surface possessive adjectives, which is
    originally identical to the (deep) lemma of the noun the adjective is
    derived from, by the lemma of the adjective itself. E.g., the a-node
    lemma 'Čapek' is changed to 'Čapkův' if the corresponding t-node has
    the 'adj:poss' formeme.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        """\
        Constructor: check that a language is set and create the lexicon.
        """
        Block.__init__(self, scenario, args)
        language_missing = self.language is None
        if language_missing:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def load(self):
        "Load the possessive adjective dictionary from the scenario data directory."
        data_dir = self.scenario.data_dir
        self.lexicon.load_possessive_adj_dict(data_dir)

    def process_tnode(self, tnode):
        """\
        Update the lemma of the lexical a-node of the given t-node if it
        is a possessive; fall back to a genitive noun if no possessive
        adjective is known for the lemma.
        """
        # the rule applies only to possessive formemes that are neither
        # pronouns on the m-layer nor personal pronouns on the t-layer
        applies = re.match(r'^(n|adj):poss$', tnode.formeme) and \
            tnode.mlayer_pos != 'P' and tnode.t_lemma != '#PersPron'
        if not applies:
            return

        anode = tnode.lex_anode
        poss_adj_lemma = self.lexicon.get_possessive_adj_for(anode.lemma)

        if not poss_adj_lemma:
            # no possessive adjective exists: use the noun in genitive
            tnode.formeme = 'n:2'
            anode.morphcat_case = '2'
            return

        # the possessive adjective exists, use it
        anode.lemma = poss_adj_lemma
        anode.morphcat_pos = 'A'
        anode.morphcat_subpos = '.'
        # gender and number will be obtained via agreement
        anode.morphcat_gender = '.'
        anode.morphcat_number = '.'
class AddCoordPunct(Block):
    """
    Add comma to coordinated lists of 3 and more elements, as well as before
    some Czech coordination conjunctions ('ale', 'ani').

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor, just checking the argument values"
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_anode(self, anode):
        """\
        Add coordination punctuation to the given anode, if applicable.

        Only nodes with the 'Coord' afun that have some children are
        processed; all other nodes are left untouched.
        """
        if anode.afun != 'Coord':
            return
        achildren = anode.get_children(ordered=True)
        if not achildren:
            return
        # add comma before certain conjunctions
        if self.lexicon.is_coord_conj(anode.lemma) == 'Y' and \
                self.is_at_clause_boundary(anode):
            acomma = self.add_comma_node(anode)
            acomma.shift_before_node(anode)
        # add comma in lists with multiple members (before every member
        # except the first one and the last one, which is connected with
        # the conjunction). The ordered list of children must be used here,
        # otherwise the skipped "first" member need not be the leftmost one.
        preceding_members = [an for an in achildren
                             if an.is_member and an < anode]
        for aprec_member in preceding_members[1:]:
            acomma = self.add_comma_node(anode)
            acomma.shift_before_subtree(aprec_member)

    def add_comma_node(self, anode):
        "Add a comma AuxX node under the given node and return it."
        return anode.create_child(data={'form': ',', 'lemma': ',',
                                        'afun': 'AuxX',
                                        'morphcat': {'pos': 'Z'},
                                        'clause_number': 0})

    def is_at_clause_boundary(self, anode):
        """Return a true value if the given node is at a clause boundary
        (i.e. the nodes immediately before and after it exist and belong
        to different clauses)."""
        prev_node = anode.get_prev_node()
        next_node = anode.get_next_node()
        return prev_node and next_node and \
            prev_node.clause_number != next_node.clause_number
class AddCoordPunct(Block):
    """
    Add comma to coordinated lists of 3 and more elements, as well as before
    some Czech coordination conjunctions ('ale', 'ani').

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor, just checking the argument values"
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_anode(self, anode):
        """\
        Add coordination punctuation to the given anode, if applicable.

        Only nodes with the 'Coord' afun that have some children are
        processed; all other nodes are left untouched.
        """
        if anode.afun != 'Coord':
            return
        achildren = anode.get_children(ordered=True)
        if not achildren:
            return
        # add comma before certain conjunctions
        if self.lexicon.is_coord_conj(anode.lemma) == 'Y' and \
                self.is_at_clause_boundary(anode):
            acomma = self.add_comma_node(anode)
            acomma.shift_before_node(anode)
        # add comma in lists with multiple members (before every member
        # except the first one and the last one, which is connected with
        # the conjunction). The ordered list of children must be used here,
        # otherwise the skipped "first" member need not be the leftmost one.
        preceding_members = [an for an in achildren
                             if an.is_member and an < anode]
        for aprec_member in preceding_members[1:]:
            acomma = self.add_comma_node(anode)
            acomma.shift_before_subtree(aprec_member)

    def add_comma_node(self, anode):
        "Add a comma AuxX node under the given node and return it."
        return anode.create_child(data={'form': ',', 'lemma': ',',
                                        'afun': 'AuxX',
                                        'morphcat': {'pos': 'Z'},
                                        'clause_number': 0})

    def is_at_clause_boundary(self, anode):
        """Return a true value if the given node is at a clause boundary
        (i.e. the nodes immediately before and after it exist and belong
        to different clauses)."""
        prev_node = anode.get_prev_node()
        next_node = anode.get_next_node()
        return prev_node and next_node and \
            prev_node.clause_number != next_node.clause_number
class AddSubconjs(AddAuxWords):
    """
    Add subordinate conjunction a-nodes according to formemes.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor: check that a language is set and create the lexicon."
        Block.__init__(self, scenario, args)
        language_missing = self.language is None
        if language_missing:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def get_aux_forms(self, tnode):
        """\
        Return the list of conjunction forms encoded in a verbal formeme
        ('v:<forms>+...'), or None if the formeme has no such part.
        """
        found = re.match(r'^v:(.+)\+', tnode.formeme)
        # individual surface forms are joined by underscores in the formeme
        return found.group(1).split('_') if found else None

    def new_aux_node(self, anode, form):
        """\
        Create a subordinate conjunction node with the given
        conjunction form as a child of the given a-node; the new node is
        placed before the subtree of its parent and returned.
        """
        aconj = anode.create_child()
        # 'aby' and 'kdyby' are inflected for the number and person
        # of the parent node
        if form not in ['aby', 'kdyby']:
            aconj.form = form
        else:
            aconj.form = self.lexicon.inflect_conditional(
                form, anode.morphcat_number, anode.morphcat_person)
        aconj.afun = 'AuxC'
        aconj.lemma = form
        aconj.morphcat_pos = 'J'
        aconj.shift_before_subtree(anode)
        return aconj
class AddSubconjs(AddAuxWords):
    """
    Add subordinate conjunction a-nodes according to formemes.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor: check that a language is set and create the lexicon."
        Block.__init__(self, scenario, args)
        language_missing = self.language is None
        if language_missing:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def get_aux_forms(self, tnode):
        """\
        Return the list of conjunction forms encoded in a verbal formeme
        ('v:<forms>+...'), or None if the formeme has no such part.
        """
        found = re.match(r'^v:(.+)\+', tnode.formeme)
        # individual surface forms are joined by underscores in the formeme
        return found.group(1).split('_') if found else None

    def new_aux_node(self, anode, form):
        """\
        Create a subordinate conjunction node with the given
        conjunction form as a child of the given a-node; the new node is
        placed before the subtree of its parent and returned.
        """
        aconj = anode.create_child()
        # 'aby' and 'kdyby' are inflected for the number and person
        # of the parent node
        if form not in ['aby', 'kdyby']:
            aconj.form = form
        else:
            aconj.form = self.lexicon.inflect_conditional(
                form, anode.morphcat_number, anode.morphcat_person)
        aconj.afun = 'AuxC'
        aconj.lemma = form
        aconj.morphcat_pos = 'J'
        aconj.shift_before_subtree(anode)
        return aconj
class AddAuxVerbCompoundFuture(Block):
    """
    Add compound future auxiliary 'bude'.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor: check that a language is set and create the lexicon."
        Block.__init__(self, scenario, args)
        language_missing = self.language is None
        if language_missing:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_tnode(self, tnode):
        """\
        Create the future auxiliary for the given t-node where appropriate:
        the conjugation moves to the new auxiliary and the originally
        conjugated node becomes an infinitive.
        """
        # only future tense ...
        if tnode.gram_tense != 'post':
            return
        # ... + processual aspect or modals
        if tnode.gram_aspect != 'proc' and tnode.gram_deontmod == 'decl':
            return
        # skip synthetic future verbs (this also rules out passives)
        aconj = tnode.get_deref_attr('wild/conjugated')
        if self.lexicon.has_synthetic_future(aconj.lemma):
            return

        # create the new auxiliary node right before the conjugated one
        abude = aconj.create_child()
        abude.shift_before_node(aconj)
        abude.afun = 'AuxV'
        abude.lemma = 'být'

        # move conjugation to the auxiliary
        abude.morphcat = aconj.morphcat
        aconj.morphcat = {'pos': 'V', 'subpos': 'f'}
        abude.morphcat_gender = '-'
        abude.morphcat_tense = 'F'

        # handle links: the auxiliary is the conjugated node from now on
        tnode.set_deref_attr('wild/conjugated', abude)
        tnode.add_aux_anodes(abude)
class AddAuxVerbConditional(Block):
    """
    Add conditional auxiliary 'by'/'bych'.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor: check that a language is set and create the lexicon."
        Block.__init__(self, scenario, args)
        language_missing = self.language is None
        if language_missing:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_tnode(self, tnode):
        """\
        Create the conditional auxiliary for the given t-node where
        appropriate and switch the originally conjugated verb to past.
        """
        # nothing to do for non-conditional verbs ...
        if tnode.gram_verbmod != 'cdn':
            return
        # ... and for formemes containing 'aby'/'kdyby'
        if re.search(r'(aby|kdyby)', tnode.formeme):
            return

        aconj = tnode.get_deref_attr('wild/conjugated')
        # auxiliary conjugated -> the new node becomes its sibling,
        # normal verb conjugated -> the new node becomes its child
        ahost = aconj.parent if aconj.afun == 'AuxV' else aconj
        acdn = ahost.create_child()

        acdn.shift_before_node(aconj)
        acdn.lemma = 'být'
        acdn.afun = 'AuxV'
        acdn.morphcat_pos = 'V'
        acdn.morphcat_subpos = 'c'
        acdn.form = self.lexicon.inflect_conditional(
            'by', aconj.morphcat_number, aconj.morphcat_person)

        # set tense of the original to past
        aconj.morphcat_subpos = 'p'
        # fix links
        tnode.add_aux_anodes(acdn)
class AddAuxVerbConditional(Block):
    """
    Add conditional auxiliary 'by'/'bych'.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor: check that a language is set and create the lexicon."
        Block.__init__(self, scenario, args)
        language_missing = self.language is None
        if language_missing:
            raise LoadingException("Language must be defined!")
        self.lexicon = Lexicon()

    def process_tnode(self, tnode):
        """\
        Create the conditional auxiliary for the given t-node where
        appropriate and switch the originally conjugated verb to past.
        """
        # nothing to do for non-conditional verbs ...
        if tnode.gram_verbmod != "cdn":
            return
        # ... and for formemes containing 'aby'/'kdyby'
        if re.search(r"(aby|kdyby)", tnode.formeme):
            return

        aconj = tnode.get_deref_attr("wild/conjugated")
        # auxiliary conjugated -> the new node becomes its sibling,
        # normal verb conjugated -> the new node becomes its child
        ahost = aconj.parent if aconj.afun == "AuxV" else aconj
        acdn = ahost.create_child()

        acdn.shift_before_node(aconj)
        acdn.lemma = "být"
        acdn.afun = "AuxV"
        acdn.morphcat_pos = "V"
        acdn.morphcat_subpos = "c"
        acdn.form = self.lexicon.inflect_conditional(
            "by", aconj.morphcat_number, aconj.morphcat_person)

        # set tense of the original to past
        aconj.morphcat_subpos = "p"
        # fix links
        tnode.add_aux_anodes(acdn)
def __init__(self, scenario, args):
    "Run the parent constructor, require a language and create the lexicon."
    parent = super(AddClausalExpletives, self)
    parent.__init__(scenario, args)
    language_missing = self.language is None
    if language_missing:
        raise LoadingException('Language must be defined!')
    self.lexicon = Lexicon()
class AddClausalExpletives(AddAuxWords):
    """
    Add clausal expletive pronoun 'to' (+preposition) to subordinate clauses
    with 'že', if the parent verb requires it.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Run the parent constructor, require a language, create the lexicon."
        parent = super(AddClausalExpletives, self)
        parent.__init__(scenario, args)
        language_missing = self.language is None
        if language_missing:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def get_aux_forms(self, tnode):
        """\
        Return the list of expletive forms to be added for the given
        t-node, or None if no expletive should be added.
        """
        # expletives are only needed with the conjunction 'že'
        # (and only if they are not already included in the formeme)
        if tnode.formeme != 'v:že+fin':
            return None
        # the lexicon tells whether the parent verb takes an expletive
        # TODO coordinations are not handled
        expletive = self.lexicon.has_expletive(tnode.parent.t_lemma)
        return expletive.split('_') if expletive else None

    def new_aux_node(self, anode, form):
        """\
        Create a child node of the given a-node for the expletive or for
        its preposition, place it before the subtree of the a-node and
        return it.
        """
        aexpl = anode.create_child()
        is_expletive = re.match(r'^t(o|oho|mu|om|ím)', form)
        if is_expletive:
            # the expletive pronoun itself
            aexpl.afun = 'Obj'
            aexpl.lemma = 'ten'
            aexpl.morphcat = {'pos': 'P', 'subpos': 'D',
                              'gender': 'N', 'number': 'S'}
        else:
            # its preposition
            aexpl.afun = 'AuxP'
            aexpl.lemma = form
            aexpl.morphcat_pos = 'R'
        aexpl.form = form
        aexpl.shift_before_subtree(anode)
        return aexpl

    def postprocess(self, tnode, anode, aux_anodes):
        """\
        Rehang the conjunction 'že', now above the expletive, under it.
        Fix clause numbers and ordering.
        """
        # find the conjunction 'že' and its parent
        aconj_ze = anode.parent.parent
        aparent = aconj_ze.parent
        # rehang the expletives: the first one goes under the parent,
        # the remaining ones under the first one
        afirst = aux_anodes[0]
        afirst.parent = aparent
        afirst.clause_number = aparent.clause_number
        for aux in aux_anodes[1:]:
            aux.parent = afirst
            aux.clause_number = aparent.clause_number
        # rehang the conjunction under the last of them
        aconj_ze.parent = aux_anodes[-1]
        # shift the conjunction right before the dependent clause
        aconj_ze.shift_before_subtree(anode)
        # hang the dependent clause under the conjunction
        anode.parent = aconj_ze

    def get_anode(self, tnode):
        "Return the a-node that is the root of the verbal a-subtree."
        # no conjugated node marked -> use the lexical a-node
        if not tnode.get_attr('wild/conjugated'):
            return tnode.lex_anode
        aconj = tnode.get_deref_attr('wild/conjugated')
        # a conjugated auxiliary hangs under the root of the verbal subtree
        return aconj.parent if aconj.afun == 'AuxV' else aconj
class ImposeAttrAgr(ImposeAgreement):
    """
    Impose case, gender and number agreement of attributes with their
    governing nouns.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor, checking the argument values"
        super(ImposeAttrAgr, self).__init__(scenario, args)
        self.lexicon = Lexicon()

    def should_agree(self, tnode):
        """\
        Find adjectives with a noun parent.

        Returns the a-layer nodes for the adjective and its parent,
        or False if the rule does not apply (wrong formeme, no lexical
        a-node, no usable effective parent).
        """
        if not re.search(r'(attr|poss)', tnode.formeme):
            return False
        anode = tnode.lex_anode
        if not anode:
            return False
        try:
            tnoun = tnode.get_eparents()[0]
            anoun = tnoun.lex_anode
            if anoun.is_root:
                return False
            return (anode, anoun)
        # Best effort: any failure to find the governing noun (e.g. no
        # effective parent or no lexical a-node) means "do not agree".
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        except Exception:
            return False

    def process_excepts(self, tnode, match_nodes):
        """\
        Handle special cases for this rule: nic/něco, numerals.

        Returns True if the case has been handled here, False otherwise.
        """
        anode, anoun = match_nodes
        if anoun.lemma in ['nic', 'něco']:
            # Case agreement, except in nominative and accusative,
            # which require genitive (genitive is also used if the noun
            # has no case set)
            noun_case = anoun.morphcat_case
            anode.morphcat_case = noun_case \
                if noun_case and noun_case not in ['1', '4'] else '2'
            # Forced neutrum singular
            anode.morphcat_number = 'S'
            anode.morphcat_gender = 'N'
            return True
        numeral = self.lexicon.number_for(anoun.lemma)
        if numeral is not None and numeral > 1:
            # Force plural in numerals
            anode.morphcat_case = anoun.morphcat_case
            anode.morphcat_gender = anoun.morphcat_gender
            anode.morphcat_number = 'P'
            return True
        return False

    def impose(self, tnode, match_nodes):
        "Impose case, gender and number agreement on attributes."
        anode, anoun = match_nodes
        # Case agreement should take place every time
        anode.morphcat_case = anoun.morphcat_case
        # Gender and number: not for nouns
        if tnode.formeme != 'n:attr' or tnode.mlayer_pos != 'N':
            anode.morphcat_number = anoun.morphcat_number
            anode.morphcat_gender = anoun.morphcat_gender
def __init__(self, scenario, args):
    "Run the parent constructor and create the lexicon."
    parent = super(ImposeAttrAgr, self)
    parent.__init__(scenario, args)
    self.lexicon = Lexicon()
class ReverseNumberNounDependency(Block):
    """
    This block reverses the dependency of incongruent Czech numerals
    (5 and higher), hanging their parents under them in the a-tree.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor, checking the argument values"
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_ttree(self, ttree):
        "Rehang the numerals for the given t-tree & a-tree pair"
        for tnode in ttree.get_children():
            self.__process_subtree(tnode)

    def __process_subtree(self, tnode):
        "Process the subtree of the given node"
        # solve the current node
        if tnode.is_coap_root():
            self.__process_coap_tnode(tnode)
        else:
            self.__process_plain_tnode(tnode)
        # recurse deeper
        for child in tnode.get_children():
            self.__process_subtree(child)

    def __process_plain_tnode(self, tnode):
        "Process a normal (non-coap) tnode"
        tnoun = tnode.parent
        # filter out cases where we don't need to do anything: lemma, case
        if tnoun < tnode or not self.__should_reverse(tnode.t_lemma):
            return
        noun_prep, noun_case = self.__get_prepcase(tnoun)
        if noun_case is None or noun_case not in ['1', '4']:
            return
        # make the switch
        self.__swap_anodes(tnode, tnoun)
        self.__update_formemes(tnode, tnoun, noun_prep, noun_case)
        # make the objects singular for Czech decimal numbers
        if re.match(r'^\d+[,.]\d+$', tnode.t_lemma):
            tnode.gram_number = 'sg'

    def __process_coap_tnode(self, tnode):
        "Process a coap root"
        # check if we have actually something to process
        tchildren = [tchild for tchild in tnode.get_children(ordered=1)
                     if tchild.is_member]
        if not tchildren:
            return
        # check whether the switch should apply to all children
        # (any() instead of a truth-tested filter(): a filter object is
        # always true on Python 3, which would skip every coordination)
        tnoun = tnode.parent
        if tnoun < tnode or \
                any(not self.__should_reverse(tchild.t_lemma)
                    for tchild in tchildren):
            return
        # check noun case
        noun_prep, noun_case = self.__get_prepcase(tnoun)
        if noun_case is None or noun_case not in ['1', '4']:
            return
        # switch the coap root with the noun
        self.__swap_anodes(tnode, tnoun)
        for tchild in tchildren:
            self.__update_formemes(tchild, tnoun, noun_prep, noun_case)
        # fix object number according to the last child
        if re.match(r'^\d+[,.]\d+$', tchildren[-1].t_lemma):
            tnode.gram_number = 'sg'

    def __update_formemes(self, tnumber, tnoun, noun_prep, noun_case):
        "Update the formemes to reflect the swap of the nodes"
        # merge number and noun prepositions
        # (the preposition part must not be optional in the pattern:
        # an optional group lets re.search succeed with an empty match at
        # position 0, so the preposition would never be captured)
        prep_match = re.search(r':(.*)\+', tnumber.formeme)
        number_prep = prep_match.group(1) if prep_match else None
        if noun_prep and number_prep:
            preps = noun_prep + '_' + number_prep + '+'
        elif noun_prep or number_prep:
            preps = (noun_prep or number_prep) + '+'
        else:
            preps = ''
        # mark formeme origins for debugging
        tnoun.formeme_origin = 'rule-number_from_parent(%s : %s)' % \
            (tnoun.formeme_origin, tnoun.formeme)
        tnumber.formeme_origin = 'rule-number_genitive'
        # Change formemes:
        # number gets merged preposition + noun case, noun gets genitive
        tnumber.formeme = 'n:%s%s' % (preps, noun_case)
        tnoun.formeme = 'n:2'

    def __swap_anodes(self, tnumber, tnoun):
        "Swap the dependency between a number and a noun on the a-layer"
        # the actual swap
        anumber = tnumber.lex_anode
        anoun = anumber.parent
        anumber.parent = anoun.parent
        anoun.parent = anumber
        # fix is_member
        if anoun.is_member:
            anoun.is_member = False
            anumber.is_member = True
        # fix parenthesis
        if anoun.get_attr('wild/is_parenthesis'):
            anoun.set_attr('wild/is_parenthesis', False)
            anumber.set_attr('wild/is_parenthesis', True)

    def __get_prepcase(self, tnoun):
        """\
        Return the (preposition, case) pair of a noun formeme whose case
        is '1', '4' or 'X'; the preposition is None if there is none.
        Returns (None, None) if the formeme does not have this shape.
        """
        try:
            return re.search(r'^n:(?:(.*)\+)?([14X])$',
                             tnoun.formeme).groups()
        # no match (or no formeme attribute) -> AttributeError,
        # formeme is not a string -> TypeError
        except (AttributeError, TypeError):
            return None, None

    def __should_reverse(self, lemma):
        """\
        Return true if the given lemma belongs to an incongruent numeral.

        This is actually a hack only to allow for translation of the English
        words "most" and 'more'. Normally, the method is_incongruent_numeral
        should be used directly.
        """
        if self.lexicon.is_incongruent_numeral(lemma) or \
                lemma in ['většina', 'menšina']:
            return True
        return False
def __init__(self, scenario, args):
    "Run the parent constructor and create the lexicon."
    parent = super(ImposeSubjPredAgr, self)
    parent.__init__(scenario, args)
    self.lexicon = Lexicon()
class AddSubordClausePunct(AddClausalPunct):
    """
    Add commas separating subordinate clauses.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor, just checking the argument values"
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_atree(self, aroot):
        """\
        Add subordinate clause punctuation to the given sentence.

        All places between two neighboring nodes are examined; a comma is
        inserted wherever the clause number changes, unless one of the
        exceptions below applies.
        """
        anodes = aroot.get_descendants(ordered=True)
        # examine all places between two nodes
        for (aleft, aright) in zip(anodes[:-1], anodes[1:]):
            # exclude all places where we don't want a comma
            # within the same clause
            if aleft.clause_number == aright.clause_number:
                continue
            # clause boundaries, such as brackets
            if aright.clause_number == 0:
                continue
            # some punctuation is here already
            if any(re.match(r'^[,:;.?!-]', an.lemma)
                   for an in (aleft, aright)):
                continue
            # coordinating conjunctions or nodes in clauses belonging
            # to the same coordination
            if any(self.lexicon.is_coord_conj(an.lemma)
                   for an in (aleft, aright)):
                continue
            if self.are_in_coord_clauses(aleft, aright):
                continue
            # left token is an opening quote or bracket
            if re.match(r'^[„(]', aleft.lemma):
                continue
            # right token is a closing bracket or quote followed by a period
            if aright.lemma == ')' or \
                    (aright.lemma == '“' and not aright.is_last_node() and
                     aright.get_next_node().lemma == '.'):
                continue
            # left token is a closing quote or bracket preceded by a comma
            # (which has been inserted in the last step); the node before
            # the LEFT token must be checked -- the node before the right
            # token is the left token itself
            if re.match(r'^[“)]', aleft.lemma) and \
                    not aleft.is_first_node() and \
                    aleft.get_prev_node().lemma == ',':
                continue
            # now we know we want to insert a comma
            acomma = self.insert_comma_between(aleft, aright)
            # move the comma if the left token marks
            # the end of an enquoted clause
            if self.is_clause_in_quotes(aleft):
                acomma.shift_before_node(aleft)
            # move the comma after clausal expletives in expression "poté co"
            if aright.lemma == 'poté':
                acomma.shift_after_node(aright)

    def are_in_coord_clauses(self, aleft, aright):
        "Check if the given nodes are in two coordinated clauses."
        alparent = self.get_clause_parent(aleft)
        arparent = self.get_clause_parent(aright)
        # the coordination test lives in the lexicon (as used in
        # process_atree); there is no module-level is_coord_conj here
        return alparent == arparent and \
            not alparent.is_root and \
            self.lexicon.is_coord_conj(alparent.lemma)

    def get_clause_parent(self, anode):
        """Return the parent of the clause the given node belongs to;
        the result may be the root of the tree."""
        if anode.clause_number == 0:
            parent = anode
        else:
            parent = anode.get_clause_root().parent
        # climb over nested coordinations/appositions
        while parent.is_coap_root() and parent.is_member:
            parent = parent.parent
        return parent

    def insert_comma_between(self, aleft, aright):
        """Insert a comma node between these two nodes,
        find out where to hang it."""
        # find out the parent: the clause root with the greater depth
        # (the right one if both depths are equal)
        aleft_clause_root = aleft.get_clause_root()
        aright_clause_root = aright.get_clause_root()
        if aleft_clause_root.get_depth() > aright_clause_root.get_depth():
            ahigher_clause_root = aleft_clause_root
        else:
            ahigher_clause_root = aright_clause_root
        # insert the new node
        acomma = ahigher_clause_root.create_child(
            data={'form': ',', 'lemma': ',', 'afun': 'AuxX',
                  'morphcat': {'pos': 'Z'}, 'clause_number': 0})
        # shift the new node to its rightful place
        acomma.shift_after_node(aleft)
        return acomma
class AddClausalExpletives(AddAuxWords):
    """
    Add clausal expletive pronoun 'to' (+preposition) to subordinate clauses
    with 'že', if the parent verb requires it.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Run the parent constructor, require a language, create the lexicon."
        parent = super(AddClausalExpletives, self)
        parent.__init__(scenario, args)
        language_missing = self.language is None
        if language_missing:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def get_aux_forms(self, tnode):
        """\
        Return the list of expletive forms to be added for the given
        t-node, or None if no expletive should be added.
        """
        # expletives are only needed with the conjunction 'že'
        # (and only if they are not already included in the formeme)
        if tnode.formeme != 'v:že+fin':
            return None
        # the lexicon tells whether the parent verb takes an expletive
        # TODO coordinations are not handled
        expletive = self.lexicon.has_expletive(tnode.parent.t_lemma)
        return expletive.split('_') if expletive else None

    def new_aux_node(self, anode, form):
        """\
        Create a child node of the given a-node for the expletive or for
        its preposition, place it before the subtree of the a-node and
        return it.
        """
        aexpl = anode.create_child()
        is_expletive = re.match(r'^t(o|oho|mu|om|ím)', form)
        if is_expletive:
            # the expletive pronoun itself
            aexpl.afun = 'Obj'
            aexpl.lemma = 'ten'
            aexpl.morphcat = {'pos': 'P', 'subpos': 'D',
                              'gender': 'N', 'number': 'S'}
        else:
            # its preposition
            aexpl.afun = 'AuxP'
            aexpl.lemma = form
            aexpl.morphcat_pos = 'R'
        aexpl.form = form
        aexpl.shift_before_subtree(anode)
        return aexpl

    def postprocess(self, tnode, anode, aux_anodes):
        """\
        Rehang the conjunction 'že', now above the expletive, under it.
        Fix clause numbers and ordering.
        """
        # find the conjunction 'že' and its parent
        aconj_ze = anode.parent.parent
        aparent = aconj_ze.parent
        # rehang the expletives: the first one goes under the parent,
        # the remaining ones under the first one
        afirst = aux_anodes[0]
        afirst.parent = aparent
        afirst.clause_number = aparent.clause_number
        for aux in aux_anodes[1:]:
            aux.parent = afirst
            aux.clause_number = aparent.clause_number
        # rehang the conjunction under the last of them
        aconj_ze.parent = aux_anodes[-1]
        # shift the conjunction right before the dependent clause
        aconj_ze.shift_before_subtree(anode)
        # hang the dependent clause under the conjunction
        anode.parent = aconj_ze

    def get_anode(self, tnode):
        "Return the a-node that is the root of the verbal a-subtree."
        # no conjugated node marked -> use the lexical a-node
        if not tnode.get_attr('wild/conjugated'):
            return tnode.lex_anode
        aconj = tnode.get_deref_attr('wild/conjugated')
        # a conjugated auxiliary hangs under the root of the verbal subtree
        return aconj.parent if aconj.afun == 'AuxV' else aconj
class ReverseNumberNounDependency(Block):
    """
    This block reverses the dependency of incongruent Czech numerals
    (5 and higher), hanging their parents under them in the a-tree.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor, checking the argument values"
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_ttree(self, ttree):
        "Rehang the numerals for the given t-tree & a-tree pair"
        for tnode in ttree.get_children():
            self.__process_subtree(tnode)

    def __process_subtree(self, tnode):
        "Process the subtree of the given node"
        # solve the current node
        if tnode.is_coap_root():
            self.__process_coap_tnode(tnode)
        else:
            self.__process_plain_tnode(tnode)
        # recurse deeper
        for child in tnode.get_children():
            self.__process_subtree(child)

    def __process_plain_tnode(self, tnode):
        "Process a normal (non-coap) tnode"
        tnoun = tnode.parent
        # filter out cases where we don't need to do anything: lemma, case
        if tnoun < tnode or not self.__should_reverse(tnode.t_lemma):
            return
        noun_prep, noun_case = self.__get_prepcase(tnoun)
        if noun_case is None or noun_case not in ['1', '4']:
            return
        # make the switch
        self.__swap_anodes(tnode, tnoun)
        self.__update_formemes(tnode, tnoun, noun_prep, noun_case)
        # make the objects singular for Czech decimal numbers
        if re.match(r'^\d+[,.]\d+$', tnode.t_lemma):
            tnode.gram_number = 'sg'

    def __process_coap_tnode(self, tnode):
        "Process a coap root"
        # check if we have actually something to process
        tchildren = [tchild for tchild in tnode.get_children(ordered=1)
                     if tchild.is_member]
        if not tchildren:
            return
        # check whether the switch should apply to all children
        # (any() instead of a truth-tested filter(): a filter object is
        # always true on Python 3, which would skip every coordination)
        tnoun = tnode.parent
        if tnoun < tnode or \
                any(not self.__should_reverse(tchild.t_lemma)
                    for tchild in tchildren):
            return
        # check noun case
        noun_prep, noun_case = self.__get_prepcase(tnoun)
        if noun_case is None or noun_case not in ['1', '4']:
            return
        # switch the coap root with the noun
        self.__swap_anodes(tnode, tnoun)
        for tchild in tchildren:
            self.__update_formemes(tchild, tnoun, noun_prep, noun_case)
        # fix object number according to the last child
        if re.match(r'^\d+[,.]\d+$', tchildren[-1].t_lemma):
            tnode.gram_number = 'sg'

    def __update_formemes(self, tnumber, tnoun, noun_prep, noun_case):
        "Update the formemes to reflect the swap of the nodes"
        # merge number and noun prepositions
        # (the preposition part must not be optional in the pattern:
        # an optional group lets re.search succeed with an empty match at
        # position 0, so the preposition would never be captured)
        prep_match = re.search(r':(.*)\+', tnumber.formeme)
        number_prep = prep_match.group(1) if prep_match else None
        if noun_prep and number_prep:
            preps = noun_prep + '_' + number_prep + '+'
        elif noun_prep or number_prep:
            preps = (noun_prep or number_prep) + '+'
        else:
            preps = ''
        # mark formeme origins for debugging
        tnoun.formeme_origin = 'rule-number_from_parent(%s : %s)' % \
            (tnoun.formeme_origin, tnoun.formeme)
        tnumber.formeme_origin = 'rule-number_genitive'
        # Change formemes:
        # number gets merged preposition + noun case, noun gets genitive
        tnumber.formeme = 'n:%s%s' % (preps, noun_case)
        tnoun.formeme = 'n:2'

    def __swap_anodes(self, tnumber, tnoun):
        "Swap the dependency between a number and a noun on the a-layer"
        # the actual swap
        anumber = tnumber.lex_anode
        anoun = anumber.parent
        anumber.parent = anoun.parent
        anoun.parent = anumber
        # fix is_member
        if anoun.is_member:
            anoun.is_member = False
            anumber.is_member = True
        # fix parenthesis
        if anoun.get_attr('wild/is_parenthesis'):
            anoun.set_attr('wild/is_parenthesis', False)
            anumber.set_attr('wild/is_parenthesis', True)

    def __get_prepcase(self, tnoun):
        """\
        Return the (preposition, case) pair of a noun formeme whose case
        is '1', '4' or 'X'; the preposition is None if there is none.
        Returns (None, None) if the formeme does not have this shape.
        """
        try:
            return re.search(r'^n:(?:(.*)\+)?([14X])$',
                             tnoun.formeme).groups()
        # no match (or no formeme attribute) -> AttributeError,
        # formeme is not a string -> TypeError
        except (AttributeError, TypeError):
            return None, None

    def __should_reverse(self, lemma):
        """\
        Return true if the given lemma belongs to an incongruent numeral.

        This is actually a hack only to allow for translation of the English
        words "most" and 'more'. Normally, the method is_incongruent_numeral
        should be used directly.
        """
        if self.lexicon.is_incongruent_numeral(lemma) or \
                lemma in ['většina', 'menšina']:
            return True
        return False
class AddAppositionPunct(Block):
    """
    Set off Czech appositions (as in 'John, my best friend, ...')
    with commas.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Set up the block; the target language is mandatory."
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_tnode(self, tnode):
        "Insert comma a-nodes around the given t-node if it is an apposition."
        tparent = tnode.parent

        # Case 1: the apposition is already marked on the t-layer,
        # only the second (closing) comma is added.
        if tnode.functor == 'APPS':
            closing = self.add_comma_node(tnode.lex_anode)
            closing.shift_after_subtree(tnode.lex_anode)
            return

        # Case 2: the apposition is hidden in an n:attr node
        if not self._is_attr_apposition(tnode, tparent):
            return

        # build the apposition node on the t-layer and hang both
        # members under it
        tgrandpa = tparent.parent
        tapp = tgrandpa.create_child(data={'functor': 'APPS',
                                           't_lemma': ';',
                                           'nodetype': 'coap'})
        tapp.shift_before_subtree(tnode)
        for tmember in (tparent, tnode):
            tmember.parent = tapp

        # mirror the structure on the a-layer: the left comma becomes
        # the head of the apposition
        # TODO hang under the apposition not only the lex_anode,
        # but also aux anodes (if they are above lex_anode).
        anode = tnode.lex_anode
        agrandpa = tgrandpa.lex_anode or anode.root
        opening = self.add_comma_node(agrandpa)
        opening.afun = 'Apos'
        opening.shift_before_subtree(anode)
        for amember in (anode, tparent.lex_anode):
            amember.parent = opening
            amember.is_member = True
        tapp.lex_anode = opening

        # the right comma is only needed if no punctuation follows
        if self.is_before_punct(anode):
            return
        closing = self.add_comma_node(opening)
        closing.shift_after_subtree(anode)
        tapp.add_aux_anodes(closing)

    def _is_attr_apposition(self, tnode, tparent):
        """\
        Test whether the node is an apposition expressed as n:attr:
        a denotative noun attribute that follows its noun parent and whose
        lemma is a personal role or a named entity label.
        """
        if tnode.formeme != 'n:attr' or tnode.gram_sempos != 'n.denot':
            return False
        if not (tparent < tnode and tparent.formeme.startswith('n:')):
            return False
        lemma = tnode.t_lemma
        return (self.lexicon.is_personal_role(lemma) or
                self.lexicon.is_named_entity_label(lemma))

    def add_comma_node(self, aparent):
        "Create a comma a-node as a child of the given parent and return it."
        comma_data = {'lemma': ',', 'form': ',', 'afun': 'AuxX'}
        return aparent.create_child(data=comma_data)

    def is_before_punct(self, anode):
        """\
        Test whether the subtree of the given node is directly followed
        by a punctuation node (or by nothing at all).
        """
        subtree = anode.get_descendants(add_self=True, ordered=True)
        next_node = subtree[-1].get_next_node()
        if not next_node:
            return True
        return re.match(r'[;.,?!„“‚‘"]', next_node.lemma)
class AddSubordClausePunct(AddClausalPunct):
    """
    Add commas separating subordinate clauses.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        "Constructor, just checking the argument values"
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_atree(self, aroot):
        """\
        Add subordinate clause punctuation to the given sentence.

        Every place between two neighboring a-nodes is examined; a comma
        is inserted wherever the clause number changes and none of the
        exceptions checked in _needs_comma applies.
        """
        anodes = aroot.get_descendants(ordered=True)
        # examine all places between two nodes
        for aleft, aright in zip(anodes[:-1], anodes[1:]):
            if not self._needs_comma(aleft, aright):
                continue
            # now we know we want to insert a comma
            acomma = self.insert_comma_between(aleft, aright)
            # move the comma if the left token marks
            # the end of an enquoted clause
            # (is_clause_in_quotes is not defined in this class;
            # presumably inherited from AddClausalPunct)
            if self.is_clause_in_quotes(aleft):
                acomma.shift_before_node(aleft)
            # move the comma after clausal expletives in expression "poté co"
            if aright.lemma == 'poté':
                acomma.shift_after_node(aright)

    def _needs_comma(self, aleft, aright):
        "Tell whether a comma belongs between the two neighboring nodes."
        # within the same clause
        if aleft.clause_number == aright.clause_number:
            return False
        # clause boundaries, such as brackets
        if aright.clause_number == 0:
            return False
        # some punctuation is here already
        if any(re.match(r'^[,:;.?!-]', an.lemma) for an in (aleft, aright)):
            return False
        # coordinating conjunctions or nodes in clauses belonging
        # to the same coordination
        if any(self.lexicon.is_coord_conj(an.lemma)
               for an in (aleft, aright)):
            return False
        if self.are_in_coord_clauses(aleft, aright):
            return False
        # left token is an opening quote or bracket
        if re.match(r'^[„(]', aleft.lemma):
            return False
        # right token is a closing bracket or quote followed by a period
        if aright.lemma == ')' or \
                (aright.lemma == '“' and not aright.is_last_node() and
                 aright.get_next_node().lemma == '.'):
            return False
        # left token is a closing quote or bracket preceded by a comma
        # (which has been inserted in the last step); the node to look at
        # is the one before the LEFT token -- the node before the right
        # token is the left token itself and can never be a comma here
        if re.match(r'^[“)]', aleft.lemma) and not aleft.is_first_node() \
                and aleft.get_prev_node().lemma == ',':
            return False
        return True

    def are_in_coord_clauses(self, aleft, aright):
        "Check if the given nodes are in two coordinated clauses."
        alparent = self.get_clause_parent(aleft)
        arparent = self.get_clause_parent(aright)
        # use the lexicon lookup, the same way as process_atree does
        return alparent == arparent and \
                not alparent.is_root and \
                self.lexicon.is_coord_conj(alparent.lemma)

    def get_clause_parent(self, anode):
        """Return the parent of the clause the given node belongs to;
        the result may be the root of the tree."""
        if anode.clause_number == 0:
            parent = anode
        else:
            parent = anode.get_clause_root().parent
        # climb over coordination heads that are themselves
        # coordination members
        while parent.is_coap_root() and parent.is_member:
            parent = parent.parent
        return parent

    def insert_comma_between(self, aleft, aright):
        """Insert a comma node between these two nodes,
        find out where to hang it.

        The comma is hung under the deeper of the two clause roots
        (the right one on a tie), placed right after the left node,
        and returned."""
        # find out the parent
        aleft_clause_root = aleft.get_clause_root()
        aright_clause_root = aright.get_clause_root()
        if aleft_clause_root.get_depth() > aright_clause_root.get_depth():
            ahigher_clause_root = aleft_clause_root
        else:
            ahigher_clause_root = aright_clause_root
        # insert the new node
        acomma = ahigher_clause_root.create_child(
            data={'form': ',', 'lemma': ',', 'afun': 'AuxX',
                  'morphcat': {'pos': 'Z'}, 'clause_number': 0})
        # shift the new node to its rightful place
        acomma.shift_after_node(aleft)
        return acomma