def __determine_nom_prefix(diamond):
    """
    Determine which character is used as the prefix for a <nom> tag.

    Usually this is the first character of the corresponding <prop> tag
    (e.g. prop = "und" will turn nom = "konjunktion" into
    nom = "u1:konjunktion", iff it is the first "konjunktion" beginning
    with "u" in that sentence). A <prop> that consists only of digits
    gets the prefix "n"; a missing or empty <prop> gets the prefix "x".

    :type diamond: ``Diamond``
    :rtype: ``str``
    :return: a single character
    """
    # Raw string, so that "\d" reaches the regex engine as a character
    # class instead of being an (invalid) string escape sequence.
    numbers_only = re.compile(r"\d+$")

    # An empty <prop> has no first character to borrow, so it is handled
    # like a missing <prop> instead of failing with an IndexError below.
    if "prop" not in diamond or not diamond["prop"]:
        return ensure_utf8("x")

    if numbers_only.match(ensure_utf8(diamond["prop"])):
        # <prop> is purely numeric (a year, page count etc.)
        nom_prefix_char = "n"
    else:
        # The character is taken from the stored value rather than from
        # the ensure_utf8() result -- presumably so that a multi-byte
        # first character is not cut in half; confirm against callers.
        nom_prefix_char = diamond["prop"].lower()[0]
    return ensure_utf8(nom_prefix_char)
def __determine_nom_prefix(diamond):
    """
    Determine which character is used as the prefix for a <nom> tag.

    Usually this is the first character of the corresponding <prop> tag
    (e.g. prop = "und" will turn nom = "konjunktion" into
    nom = "u1:konjunktion", iff it is the first "konjunktion" beginning
    with "u" in that sentence). A <prop> that consists only of digits
    gets the prefix "n"; a missing or empty <prop> gets the prefix "x".

    @type diamond: C{Diamond}
    @rtype: C{str}
    @return: a single character
    """
    # Raw string, so that "\d" reaches the regex engine as a character
    # class instead of being an (invalid) string escape sequence.
    numbers_only = re.compile(r"\d+$")

    # An empty <prop> has no first character to borrow, so it is handled
    # like a missing <prop> instead of failing with an IndexError below.
    if "prop" not in diamond or not diamond["prop"]:
        return ensure_utf8("x")

    if numbers_only.match(ensure_utf8(diamond["prop"])):
        # <prop> is purely numeric (a year, page count etc.)
        nom_prefix_char = "n"
    else:
        # The character is taken from the stored value rather than from
        # the ensure_utf8() result -- presumably so that a multi-byte
        # first character is not cut in half; confirm against callers.
        nom_prefix_char = diamond["prop"].lower()[0]
    return ensure_utf8(nom_prefix_char)
def convert_diamond_xml2fs(etree):
    """
    Transform a HLDS XML <diamond>...</diamond> structure (that was
    parsed into an etree element) into an NLTK feature structure.

    Nested <diamond> children are converted recursively. If the element
    has several <nom> or <prop> children, the last one of each kind
    wins; if it has none, the empty string is passed on instead.

    @type etree: C{etree._Element}
    @param etree: a diamond etree element
    @rtype: C{Diamond}
    @raise KeyError: if the element has no "mode" attribute or one of
        its <nom>/<prop> children has no "name" attribute
    """
    mode = ensure_utf8(etree.attrib["mode"])
    nested_diamonds = []
    nom = ""  # default value
    prop = ""  # default value

    # Iterating an element yields its direct children; this replaces the
    # deprecated getchildren() call without changing what is visited.
    for child in etree:
        if child.tag == "diamond":
            nested_diamonds.append(convert_diamond_xml2fs(child))
        elif child.tag == "nom":
            nom = ensure_utf8(child.attrib["name"])
        elif child.tag == "prop":
            prop = ensure_utf8(child.attrib["name"])

    return create_diamond(mode, nom, prop, nested_diamonds)
def convert_diamond_xml2fs(etree):
    """
    Transform a HLDS XML <diamond>...</diamond> structure (that was
    parsed into an etree element) into an NLTK feature structure.

    Nested <diamond> children are converted recursively. If the element
    has several <nom> or <prop> children, the last one of each kind
    wins; if it has none, the empty string is passed on instead.

    :type etree: ``etree._Element``
    :param etree: a diamond etree element
    :rtype: ``Diamond``
    :raises KeyError: if the element has no "mode" attribute or one of
        its <nom>/<prop> children has no "name" attribute
    """
    mode = ensure_utf8(etree.attrib["mode"])
    nested_diamonds = []
    nom = ""  # default value
    prop = ""  # default value

    # Iterating an element yields its direct children; this replaces the
    # deprecated getchildren() call without changing what is visited.
    for child in etree:
        if child.tag == "diamond":
            nested_diamonds.append(convert_diamond_xml2fs(child))
        elif child.tag == "nom":
            nom = ensure_utf8(child.attrib["name"])
        elif child.tag == "prop":
            prop = ensure_utf8(child.attrib["name"])

    return create_diamond(mode, nom, prop, nested_diamonds)
def add_nom_prefixes(diamond):
    """
    Prefix the 'nom' value of every C{Diamond} found in a C{Diamond} or
    C{Sentence} structure with a character and a running index. The
    structure is modified in place. Without these prefixes,
    I{ccg-realize} will only produce gibberish.

    The name attribute of a <nom> tag holds a category/type-like
    description of the name attribute of the corresponding <prop> tag,
    e.g.::

        <diamond mode="PRÄP">
            <nom name="v1:zugehörigkeit"/>
            <prop name="von"/>
        </diamond>

    'zugehörigkeit' is the category that the preposition 'von' belongs
    to. The prefix is usually the first character of the prop name,
    followed by an index that is counted separately for each prefix
    character while walking through all diamonds of the structure.
    'v1:zugehörigkeit' therefore says that "von" is the first
    C{diamond} in the structure that starts with 'v'.

    @type diamond: C{Diamond} or C{Sentence}
    """
    seen_per_prefix = defaultdict(int)

    # The walk is exhausted before any node gets modified.
    for node in list(diamond.walk()):
        # Only exact Diamond instances that carry a 'nom' are renamed.
        if type(node) is not Diamond or "nom" not in node.keys():
            continue

        prefix_char = __determine_nom_prefix(node)
        seen_per_prefix[prefix_char] += 1

        plain_nom = node["nom"]
        prefixed_nom = "{0}{1}:{2}".format(
            ensure_utf8(prefix_char),
            seen_per_prefix[prefix_char],
            ensure_utf8(plain_nom))

        if type(plain_nom) == unicode:
            # hand back unicode if the value was unicode to begin with
            prefixed_nom = ensure_unicode(prefixed_nom)
        node["nom"] = prefixed_nom
def add_nom_prefixes(diamond):
    """
    Prefix the 'nom' value of every ``Diamond`` found in a ``Diamond``
    or ``Sentence`` structure with a character and a running index. The
    structure is modified in place. Without these prefixes,
    ``ccg-realize`` will only produce gibberish.

    The name attribute of a <nom> tag holds a category/type-like
    description of the name attribute of the corresponding <prop> tag,
    e.g.::

        <diamond mode="PRÄP">
            <nom name="v1:zugehörigkeit"/>
            <prop name="von"/>
        </diamond>

    'zugehörigkeit' is the category that the preposition 'von' belongs
    to. The prefix is usually the first character of the prop name,
    followed by an index that is counted separately for each prefix
    character while walking through all diamonds of the structure.
    'v1:zugehörigkeit' therefore says that "von" is the first
    ``diamond`` in the structure that starts with 'v'.

    :type diamond: ``Diamond`` or ``Sentence``
    """
    seen_per_prefix = defaultdict(int)

    # The walk is exhausted before any node gets modified.
    for node in list(diamond.walk()):
        # Only exact Diamond instances that carry a 'nom' are renamed.
        if type(node) is not Diamond or "nom" not in node.keys():
            continue

        prefix_char = __determine_nom_prefix(node)
        seen_per_prefix[prefix_char] += 1

        plain_nom = node["nom"]
        prefixed_nom = "{0}{1}:{2}".format(
            ensure_utf8(prefix_char),
            seen_per_prefix[prefix_char],
            ensure_utf8(plain_nom))

        if type(plain_nom) == unicode:
            # hand back unicode if the value was unicode to begin with
            prefixed_nom = ensure_unicode(prefixed_nom)
        node["nom"] = prefixed_nom
def parse_sentence(self, sentence, single_sent=True):
    """
    Build a ``Sentence`` feature structure from parsed HLDS XML.

    :param sentence: if ``single_sent`` is true, the <satop> element
        itself (the sentence string is then read from the <target>
        element of its tree, if there is one); otherwise an <item>
        element that holds the <satop> under ``xml/lf/satop`` and
        carries "string" and "numOfParses" attributes
    :param single_sent: selects how ``sentence`` is interpreted; any
        truthy value means single-sentence input
    :rtype: ``Sentence``
    :return: the structure filled by ``Sentence.create_sentence``
    """
    # Plain truthiness instead of "is True"/"is False": with identity
    # checks, values such as 1, 0 or None matched neither branch and
    # left satop unbound.
    if single_sent:
        satop = sentence
        target_element = sentence.getroottree().find("target")
        if target_element is not None:
            sentence_string = target_element.text
        else:
            sentence_string = ""
        expected_parses = 1
    else:
        item = sentence
        satop = item.find("xml/lf/satop")  # root (verb) of the sentence
        # <item numOfParses="4" string="er beschreibt sie">
        sentence_string = ensure_utf8(item.attrib["string"])
        # NOTE(review): passed on as the raw attribute value, whereas
        # the single-sentence branch uses the integer 1.
        expected_parses = item.attrib["numOfParses"]

    # <satop nom="b1:handlung">
    #   <prop name="beschreiben"/>
    root_prop = ""  # some HLDS satop structures don't have a <prop> tag
    prop_element = satop.find("prop")
    if prop_element is not None:
        root_prop = prop_element.attrib["name"]
    root_nom = satop.attrib["nom"]

    elements = [convert_diamond_xml2fs(element)
                for element in satop.findall("diamond")]

    result = Sentence()
    result.create_sentence(sentence_string, expected_parses, root_nom,
                           root_prop, elements)
    return result
def featstruct2avm(featstruct, mode="non-recursive"):
    """
    Convert an NLTK feature structure into an attribute-value matrix
    that can be printed with LaTeX's avm environment.

    Entries are emitted in sorted key order. A nested C{Diamond} is
    labelled with its own "mode" feature, any other nested feature
    structure with its key. Plain "mode" and "expected_parses" entries
    are left out, and "root_nom"/"root_prop" are printed as
    "nom"/"prop".

    @type featstruct: C{nltk.featstruct} or C{Diamond} or C{Sentence}
    @type mode: C{str}
    @param mode: "non-recursive" (the default) escapes "*" and "_" and
        wraps the matrix in an avm environment; any other value returns
        only the bracketed matrix, which is what the recursive calls
        for nested structures use
    @rtype: C{str}
    """
    rows = []
    for key, val in sorted(featstruct.items()):
        if isinstance(val, Diamond):
            # handles nested Diamond structures
            diamond_key = val[Feature("mode")]
            diamond_val = featstruct2avm(val, mode="recursive")
            rows.append("{0} & {1} \\\\\n".format(
                ensure_utf8(diamond_key), ensure_utf8(diamond_val)))
        elif isinstance(val, nltk.FeatStruct):
            # every other subclass of FeatStruct incl. FeatStruct
            nested_featstruct = featstruct2avm(val, mode="recursive")
            rows.append("{0} & {1} \\\\\n".format(
                ensure_utf8(key), ensure_utf8(nested_featstruct)))
        else:
            # normal key, value pairs within a FeatStruct
            if key in (Feature("mode"), Feature("expected_parses")):
                continue  # don't print "mode" or "expected_parses" keys
            elif key == Feature("root_nom"):
                key = Feature("nom")
            elif key == Feature("root_prop"):
                key = Feature("prop")
            rows.append("{0} & `{1}' \\\\\n".format(
                ensure_utf8(key), ensure_utf8(val)))

    # Backslashes are written as explicit "\\" escapes; the resulting
    # strings are the same as with the former "\[", "\]" and "\_".
    ret_str = "\\[ " + "".join(rows) + " \\]\n"

    if mode == "non-recursive":
        clean_ret_str = ret_str.replace("*", "$*$").replace("_", "\\_")
        ret_str = "{0}\n{1}{2}".format(
            "\\begin{avm}", clean_ret_str, "\\end{avm}")
    return ret_str
def featstruct2avm(featstruct, mode="non-recursive"):
    """
    Convert an NLTK feature structure into an attribute-value matrix
    that can be printed with LaTeX's avm environment.

    Entries are emitted in sorted key order. A nested ``Diamond`` is
    labelled with its own "mode" feature, any other nested feature
    structure with its key. Plain "mode" and "expected_parses" entries
    are left out, and "root_nom"/"root_prop" are printed as
    "nom"/"prop".

    :type featstruct: ``nltk.featstruct`` or ``Diamond`` or ``Sentence``
    :type mode: ``str``
    :param mode: "non-recursive" (the default) escapes "*" and "_" and
        wraps the matrix in an avm environment; any other value returns
        only the bracketed matrix, which is what the recursive calls
        for nested structures use
    :rtype: ``str``
    """
    rows = []
    for key, val in sorted(featstruct.items()):
        if isinstance(val, Diamond):
            # handles nested Diamond structures
            diamond_key = val[Feature("mode")]
            diamond_val = featstruct2avm(val, mode="recursive")
            rows.append("{0} & {1} \\\\\n".format(
                ensure_utf8(diamond_key), ensure_utf8(diamond_val)))
        elif isinstance(val, nltk.FeatStruct):
            # every other subclass of FeatStruct incl. FeatStruct
            nested_featstruct = featstruct2avm(val, mode="recursive")
            rows.append("{0} & {1} \\\\\n".format(
                ensure_utf8(key), ensure_utf8(nested_featstruct)))
        else:
            # normal key, value pairs within a FeatStruct
            if key in (Feature("mode"), Feature("expected_parses")):
                continue  # don't print "mode" or "expected_parses" keys
            elif key == Feature("root_nom"):
                key = Feature("nom")
            elif key == Feature("root_prop"):
                key = Feature("prop")
            rows.append("{0} & `{1}' \\\\\n".format(
                ensure_utf8(key), ensure_utf8(val)))

    # Backslashes are written as explicit "\\" escapes; the resulting
    # strings are the same as with the former "\[", "\]" and "\_".
    ret_str = "\\[ " + "".join(rows) + " \\]\n"

    if mode == "non-recursive":
        clean_ret_str = ret_str.replace("*", "$*$").replace("_", "\\_")
        ret_str = "{0}\n{1}{2}".format(
            "\\begin{avm}", clean_ret_str, "\\end{avm}")
    return ret_str