Пример #1
0
def get_words_from_kap(node):
    r"""Split the flattened text of a <kap> node into its terms.

    All terms inside one <kap> are alternative spellings of the same
    headword. A term may contain spaces (ReVo has entries such as
    'brazila nukso'), so terms are separated on commas rather than on
    whitespace.

    Sample input:

    <kap><ofc>*</ofc><tld/>o</kap>
    <kap>brazil<tld/>arbo, <var><kap>brazila <tld/>arbo</kap></var></kap>
    (from nuks.xml)

    flatten_node produces the text (called with <ofc> and <fnt> as
    skip_tags); this function only splits that text and tidies each
    piece.

    Flattened strings seen so far:
    'foo'
    'foo, bar'
    'foo,\n   bar'
    '(n,p)-matrico' (the only term in ReVo with an internal comma)

    Returns a list of term strings.

    """
    flat_string = flatten_node(node, skip_tags=['ofc', 'fnt'])

    # the one headword whose own spelling contains a comma
    if flat_string == '(n,p)-matrico':
        return ['(n,p)-matrico']

    terms = flat_string.split(',')
    if len(terms) == 1:
        # nothing was split off; hand back the string exactly as
        # flatten_node produced it
        return terms

    # strip the leading/trailing spaces and awkward newlines left
    # around each comma-separated piece
    return [clean_string(term) for term in terms]
Пример #2
0
def get_words_from_kap(node):
    r"""Return every term spelled out in a <kap> node as a list.

    The terms of a single <kap> are alternative spellings of one
    headword. They are not always single words (ReVo includes entries
    such as 'brazila nukso'), which is why the split is on commas.

    <kap><ofc>*</ofc><tld/>o</kap>
    <kap>brazil<tld/>arbo, <var><kap>brazila <tld/>arbo</kap></var></kap>
    (from nuks.xml)

    The text itself comes from flatten_node, called with <ofc> and
    <fnt> as skip_tags; here we only separate the terms and remove
    extraneous whitespace.

    Possible formats of the flattened text:
    'foo'
    'foo, bar'
    'foo,\n   bar'
    '(n,p)-matrico' (the only term in ReVo with an internal comma)

    """
    flattened = flatten_node(node, skip_tags=["ofc", "fnt"])

    if flattened == "(n,p)-matrico":
        # special case: this comma is part of the term, do not split
        pieces = ["(n,p)-matrico"]
    else:
        pieces = flattened.split(",")

    if len(pieces) <= 1:
        # a lone term is returned untouched
        return pieces

    # several terms: trim stray spaces and newlines from each one
    return list(map(clean_string, pieces))
Пример #3
0
def flatten_node(node, skip_tags=None):
    """Return a friendly string representing the contents of this node
    and its children. This method is generic although occasionally we
    need methods which are specific to a certain node type.

    The node's own contribution comes from the callable returned by
    get_flatten_method(node); every direct child is then flattened with
    _flatten and appended in order. The combined text is passed through
    clean_string before being returned.

    skip_tags specifies node tags for a node which we don't recurse
    into (although we will collect its tail, since that is outside).
    It is forwarded unchanged to _flatten for each child.

    Some examples:

    <rim>
      La tuta terminologio pri <tld/>oj, <tld/>-vektoroj kaj -subspacoj
      de endomorfio ekzistas anka&ubreve;
      por <frm>(<k>n</k>,<k>n</k>)</frm>-matrico, konvencie
      identigita kun la endomorfio, kies matrico rilate al la kanona bazo
      de <frm><g>K</g><sup><k>n</k></sup></frm> &gcirc;i estas.
    </rim>
    (from ajgen.xml)

    <ekz>
      <ctl>popolo</ctl>, <ctl>foliaro</ctl>, <ctl>herbo</ctl>,
      <ctl>armeo</ctl> estas ar<tld/>oj.
    </ekz>
    (from vort.xml)

    <ekz>
      <ind>saluton!</ind>
      [...]
    </ekz>
    (from salut.xml)

    <klr>(de <ref cel="polino.0o">polinomo</ref>)</klr>
    (from radik.xml)

    """
    flatten_method = get_flatten_method(node)

    # text contributed by the node itself comes first
    pieces = [flatten_method(node)]

    # Iterate the element directly instead of calling getchildren():
    # that method is deprecated, and iterating yields the same direct
    # children in the same order.
    pieces.extend(_flatten(child, skip_tags) for child in node)

    return clean_string(''.join(pieces))
Пример #4
0
def get_examples(node):
    """Get all examples from the children of a node. Examples tend to
    be in <dif>s, and take the following form:

    <ekz>
      simpla, kunmetita, dubsenca <tld/>o;
    </ekz><ekz>
      uzi la &gcirc;ustan, konvenan <tld/>on;
    </ekz><ekz>
      la bildoj elvokitaj de la <tld/>oj;
    </ekz><ekz>
      <tld/>ordo.
    </ekz>
    (from vort.xml)

    Sometimes (bizarrely) examples spread across several <ekz> nodes:

    <ekz>
      <tld/>i al si plezuron<fnt>Z</fnt>;
    </ekz><ekz>
      <tld/>i instruon<fnt>Z</fnt>,
    </ekz><ekz>
      amikecon<fnt>Z</fnt>,
    [...]
    (from sercx.xml)

    Sometimes only references, which we discard:

    <ekz>
      <ref tip="sub" cel="bier.0o">biero</ref>,
      <ref tip="sub" cel="brand.0o">brando</ref>,
      <ref tip="sub" cel="vin.0o">vino</ref>
    </ekz>
    (from alkoho.xml)

    <subsnc mrk="afekt.0o.sxajnigi" ref="afekt.0i.sxajnigi">
      <ekz>
        kiom a&ccirc;as la <tld/>o komplezi al duonvivul'
    (from afekt.xml)

    Each <ekz> is flattened with flatten_example; falsy results are
    skipped and the rest are unpacked as (example, source) pairs.

    Returns a list of (example, source) tuples. When several <ekz>
    pieces are joined into one example, the source of the last piece
    is the one kept. If the final example still ends with a comma it is
    not returned; a warning naming the article's first headword is
    printed instead.

    """
    # examples tend to be on <dif>s, but they can also be on the
    # <snc>/<subsnc> itself (or even a <drv>!); the <dif> ones go first
    ekz_nodes = [ekz_node
                 for dif_node in node.findall('dif')
                 for ekz_node in dif_node.findall('ekz')]
    ekz_nodes.extend(node.findall('ekz'))

    raw_examples = []
    for ekz_node in ekz_nodes:
        raw_example = flatten_example(ekz_node)
        if raw_example:
            raw_examples.append(raw_example)

    # fix examples spread over multiple <ekz>s by concatenating each
    # example that ends with a comma with the next example
    examples = []
    pending = ""
    for example, source in raw_examples:
        pending += ' ' + example

        if not pending.endswith(','):
            examples.append((clean_string(pending), source))
            pending = ""

    if pending:
        # pending is only non-empty when at least one <ekz> was seen,
        # so ekz_nodes[-1] exists; use it to find the enclosing <art>.
        # The builtin next() is used because iterators have no .next()
        # method on Python 3.
        art_node = next(ekz_nodes[-1].iterancestors('art'))
        kap_node = next(art_node.iter('kap'))
        word = get_words_from_kap(kap_node)[0]
        print("Warning: example for %s ended with comma: %s" %
              (word, clean_string(pending)))

    return examples
Пример #5
0
 def get_args(self, message, lower=True):
     """Return the cleaned text that follows the command prefix.

     The first len('okuyasu <self.name> ') characters of
     message.clean_content are dropped; the content is not checked to
     actually start with that prefix. The remainder is passed to
     clean_string together with the lower flag.
     """
     content = message.clean_content
     prefix_length = len(f'okuyasu {self.name} ')
     raw_args = content[prefix_length:]
     return clean_string(raw_args, lower=lower)