Пример #1
0
def scrape_adverbs(filename):
    """Collect gerunds (lyap) and infinitives (tum) as comma-separated text.

    'tvA' gerunds are not produced here; they come from SL_adverbs.xml.

    Every entry yielded by ``scrape_utils.iter_xml(filename)`` that has a
    'vu' child contributes one line.  Entries without one are skipped.

    Returns a newline-joined string whose first line is the header
    ``name,root,pos,modification``.

    Raises Exception if the first child of 'iv' is neither 'abs' nor 'inf'.
    """
    columns = ['name', 'root', 'pos', 'modification']
    row_template = ','.join('{%s}' % column for column in columns)

    # Tag found under 'iv' -> part-of-speech label written to the output.
    pos_by_tag = {'abs': 'gerund', 'inf': 'infinitive'}

    lines = [','.join(columns)]
    for entry in scrape_utils.iter_xml(filename):
        vu = entry.find('vu')
        if vu is None:
            continue

        name = entry.attrib['form']
        root = entry.find('s').attrib['stem']
        modification = trans[vu.find('cj')[0].tag]

        iv_tag = vu.find('iv')[0].tag
        if iv_tag not in pos_by_tag:
            raise Exception("Unknown POS %s" % iv_tag)

        lines.append(row_template.format(
            name=name,
            root=root,
            pos=pos_by_tag[iv_tag],
            # A falsy modification is written as an empty field.
            modification=modification or '',
        ))

    return '\n'.join(lines)
Пример #2
0
def scrape_adverbs(filename):
    """Collect gerunds (ktvA) as comma-separated text.

    Infinitives (tum) come from SL_final.xml, and other indeclinables
    come from the MW data.

    Every entry yielded by ``scrape_utils.iter_xml(filename)`` is examined.
    Entries with no 'ab' child, or with an 'ab' child that has no children
    of its own, are skipped, as are forms not ending in 'vA' or 'ya'.

    Returns a newline-joined string whose first line is the header
    ``name,root,pos,modification``.
    """
    labels = ['name', 'root', 'pos', 'modification']
    format_str = ','.join('{%s}' % x for x in labels)

    output = [','.join(labels)]
    for xml in scrape_utils.iter_xml(filename):
        # Only gerunds.  Test explicitly rather than with `not ab`: an
        # Element's truth value reflects whether it has children, and
        # relying on it is deprecated.  This spelling skips exactly the
        # same entries (missing 'ab', or 'ab' with no children).
        ab = xml.find('ab')
        if ab is None or len(ab) == 0:
            continue

        name = xml.attrib['form']
        root = xml.find('s').attrib['stem']
        modification = trans[ab.find('cj')[0].tag]

        # Filter out e.g. "Asam"
        if name[-2:] not in ('vA', 'ya'):
            continue

        output.append(format_str.format(
            name=name,
            root=root,
            pos='gerund',
            # A falsy modification is written as an empty field.
            modification=modification or '',
        ))

    return '\n'.join(output)
Пример #3
0
def scrape_adverbs(filename):
    """Collect gerunds (ktvA) as comma-separated text.

    Infinitives (tum) come from SL_final.xml, and other indeclinables
    come from the MW data.

    Entries yielded by ``scrape_utils.iter_xml(filename)`` are skipped when
    they have no 'ab' child, when that child has no children of its own, or
    when the form does not end in 'vA' or 'ya'.

    Returns a newline-joined string whose first line is the header
    ``name,root,pos,modification``.
    """
    labels = ['name', 'root', 'pos', 'modification']
    format_str = ','.join('{%s}' % x for x in labels)

    output = [','.join(labels)]
    for xml in scrape_utils.iter_xml(filename):
        # Only gerunds.  `not ab` would lean on Element truth-testing
        # (true only when the element has children), which is deprecated;
        # the explicit test below skips the same entries.
        ab = xml.find('ab')
        if ab is None or len(ab) == 0:
            continue

        name = xml.attrib['form']
        root = xml.find('s').attrib['stem']
        modification = trans[ab.find('cj')[0].tag]

        # Filter out e.g. "Asam"
        if name[-2:] not in ('vA', 'ya'):
            continue

        output.append(format_str.format(**{
            'name': name,
            'root': root,
            'pos': 'gerund',
            # A falsy modification is written as an empty field.
            'modification': modification or '',
        }))

    return '\n'.join(output)
Пример #4
0
def scrape(filename):
    """Inflected verbs.

    Reads every entry yielded by ``scrape_utils.iter_xml(filename)`` and
    returns ``(labels, rows)``: ``labels`` is the list of column names and
    ``rows`` is a list of tuples in the same column order.

    Entries whose tense system is not one of 'prs', 'tp', 'pas' or 'pef'
    are dumped to stdout and skipped.
    """
    labels = ['form', 'root', 'class', 'person', 'number', 'mode', 'voice',
              'modification']
    rows = []

    for xml in scrape_utils.iter_xml(filename):
        v = xml.find('v')
        cj = v.find('cj')
        tense = v.find('sys')[0]
        np_elem = v.find('np')
        s = xml.find('s')

        # Present system (present, imperfect, imperative, optative)
        if tense.tag == 'prs':
            vclass = tense.attrib.get('k')
            mode = trans[tense.find('md')[0].tag]
            voice = trans[tense[1].tag]

        # "Tense paradigm" (future, aorist, conditional, perfect,
        #                   injunctive, benedictive)
        elif tense.tag == 'tp':
            vclass = None
            mode = trans[tense[0].tag]
            voice = trans[tense[1].tag]

        # Passive
        elif tense.tag == 'pas':
            vclass = None
            mode = trans[tense.find('md')[0].tag]
            voice = trans[tense.tag]

        # Periphrastic future
        elif tense.tag == 'pef':
            vclass = None
            mode = trans[tense.tag]
            voice = trans[tense[0].tag]

        else:
            # Unknown tense system: dump the entry for inspection and skip
            # it.  Falling through would build a row from the previous
            # entry's class/mode/voice (or fail with NameError on the very
            # first entry).
            print(ET.tostring(xml))
            continue

        name = xml.attrib['form']
        root = s.attrib['stem']
        person = trans[np_elem[1].tag]
        number = trans[np_elem[0].tag]
        modification = trans[cj[0].tag]

        # Denominative
        if vclass == '11':
            vclass = 'denom'

        # For non-classed verb forms.
        vclass = vclass or ''

        rows.append((name, root, vclass, person, number, mode, voice,
                     modification))

    return labels, rows
Пример #5
0
def scrape(parts_file):
    """Participles.

    Walks every 'pa' child of every entry yielded by
    ``scrape_utils.iter_xml(parts_file)`` and keeps only those whose
    inflection translates to gender 'm', case '1', number 's'.  A stem is
    derived from that form.

    Returns ``(labels, rows)``: ``labels`` is the list of column names and
    ``rows`` is a list of tuples in the same column order.
    """
    labels = ['stem', 'root', 'class', 'mode', 'voice', 'modification']
    rows = []

    for entry in scrape_utils.iter_xml(parts_file):
        form = entry.attrib['form']
        root = entry.find('s').attrib['stem']

        for pa in entry.findall('pa'):
            # Inflectional info: the first three children of 'na', in order.
            na = pa.find('na')
            case, number, gender = [trans[na[i].tag] for i in range(3)]
            if not (gender == 'm' and case == '1' and number == 's'):
                continue

            # Morphological info (stem)
            modification = trans[pa.find('cj')[0].tag]
            mode_elem = pa.find('no')[0]
            mode, voice = trans[mode_elem.tag]

            vclass = None
            if voice == 'active':
                if mode == 'pres':
                    # Class is the text of the first child; the voice is
                    # re-read from the second child.
                    vclass = mode_elem[0].text
                    voice = trans[mode_elem[1].tag]
                elif mode in ('fut', 'perf'):
                    voice = trans[mode_elem[0].tag]
            # In every other case the voice from the lookup above stands.

            # '11', '12', and '13' refer to "modified" verb classes. We can
            # just discard these.
            if vclass and modification is not None:
                vclass = None

            # Construct stem
            if form[-1] == 's':
                stem = form[:-1]
            elif form.endswith('an'):
                # -an -> -at
                stem = form[:-1] + 't'
            elif mode == 'perf':
                # -vAn -> -vas
                stem = form[:-2] + 'as'
            elif mode == 'past':
                # -vAn -> -vat
                stem = form[:-2] + 'at'
            else:
                # Encoding error, but high recall is OK.
                stem = form

            rows.append((stem, root, vclass, mode, voice, modification))

    return labels, rows
def scrape(parts_file):
    """Participles.

    Walks every 'pa' child of every entry yielded by
    ``scrape_utils.iter_xml(parts_file)``, keeps only those whose inflection
    translates to ('m', '1', 's') for (gender, case, number), and derives a
    stem from that form.

    Returns ``(labels, rows)``: ``labels`` is the list of column names and
    ``rows`` is a list of tuples in the same column order.
    """

    labels = ['stem', 'root', 'class', 'mode', 'voice', 'modification']
    rows = []

    for xml in scrape_utils.iter_xml(parts_file):
        form = xml.attrib['form']
        root = xml.find('s').attrib['stem']

        for pa in xml.findall('pa'):
            # Inflectional info
            na = pa.find('na')
            case = trans[na[0].tag]
            number = trans[na[1].tag]
            gender = trans[na[2].tag]

            if (gender, case, number) != ('m', '1', 's'):
                continue

            # Morphological info (stem)
            modification = trans[pa.find('cj')[0].tag]
            mode_elem = pa.find('no')[0]
            mode, voice = trans[mode_elem.tag]
            if (mode, voice) == ('pres', 'active'):
                vclass = mode_elem[0].text
                voice = trans[mode_elem[1].tag]
            elif (mode, voice) in [('fut', 'active'), ('perf', 'active')]:
                vclass = None
                voice = trans[mode_elem[0].tag]
            else:
                vclass = None
                # voice = default

            # '11', '12', and '13' refer to "modified" verb classes. We can
            # just discard these.
            if vclass and modification is not None:
                vclass = None

            # Construct stem
            if form[-1] == 's':
                stem = form[:-1]
            elif form.endswith('an'):
                stem = form[:-1] + 't'   # -an  -> -at
            elif mode == 'perf':
                stem = form[:-2] + 'as'  # -vAn -> -vas
            elif mode == 'past':
                stem = form[:-2] + 'at'  # -vAn -> -vat
            else:
                # Encoding error, but high recall is OK.
                stem = form

            rows.append((stem, root, vclass, mode, voice, modification))

    # Hand the collected data back to the caller; without this the function
    # would do all the work and implicitly return None.
    return labels, rows
Пример #7
0
def scrape_adverbs(filename):
    """Collect gerunds (lyap) and infinitives (tum) as comma-separated text.

    'tvA' gerunds are not produced here; they come from SL_adverbs.xml.

    Entries yielded by ``scrape_utils.iter_xml(filename)`` that lack a 'vu'
    child are skipped; every other entry contributes one line.

    Returns a newline-joined string whose first line is the header
    ``name,root,pos,modification``.

    Raises Exception if the first child of 'iv' is neither 'abs' nor 'inf'.
    """
    fields = ['name', 'root', 'pos', 'modification']
    line_format = ','.join('{%s}' % field for field in fields)

    result = [','.join(fields)]
    for node in scrape_utils.iter_xml(filename):
        vu = node.find('vu')
        if vu is None:
            continue

        record = {
            'name': node.attrib['form'],
            'root': node.find('s').attrib['stem'],
        }
        # A falsy modification is written as an empty field.
        record['modification'] = trans[vu.find('cj')[0].tag] or ''

        kind = vu.find('iv')[0].tag
        if kind == 'abs':
            record['pos'] = 'gerund'
        elif kind == 'inf':
            record['pos'] = 'infinitive'
        else:
            raise Exception("Unknown POS %s" % kind)

        result.append(line_format.format(**record))

    return '\n'.join(result)