Exemplo n.º 1
0
def read_rogetmap(xml, verbose=True):
    """Parse Roget map (Roget hierarchy) into a dictionary with Roget head words as keys.

    Each head word maps to a (subsection, section, class) tuple taken from
    the hierarchy elements most recently seen in the XML stream.
    """
    if verbose:
        log.info("Reading XML lexicon")
    lexicon = {}

    # Stream the XML; both "start" and "end" events are requested, so every
    # element is visited twice — re-assigning the current hierarchy level
    # twice is harmless.
    for _event, node in iter(etree.iterparse(xml, events=("start", "end"))):
        tag = node.tag
        if tag == "class":
            roget_class = node.get("name")
        elif tag == "section":
            roget_section = node.get("name")
        elif tag == "subsection":
            roget_subsection = node.get("name")
        elif tag == "headword":
            lexicon[node.get("name")] = (
                roget_subsection, roget_section, roget_class)

    testwords = ["Existence", "Health", "Amusement", "Marriage"]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read.")
    return lexicon
Exemplo n.º 2
0
def read_blingbring(tsv, classmap, verbose=True):
    """Read the tsv version of the Blingbring lexicon (blingbring.xml).

    Arguments:
        tsv: path to the tab-separated Blingbring lexicon file.
        classmap: path to the Roget hierarchy XML (passed to read_rogetmap).
        verbose: if True, log progress messages.

    Return a lexicon dictionary: {senseid: {roget_head: roget_head,
                                            roget_subsection: roget_subsection,
                                            roget_section: roget_section,
                                            roget_class: roget_class,
                                            bring: bring_ID}
    """
    import csv

    # BUGFIX: propagate the caller's verbosity instead of hardcoding True.
    rogetdict = read_rogetmap(xml=classmap, verbose=verbose)

    if verbose:
        log.info("Reading tsv lexicon")
    lexicon = {}
    classmapping = {}

    with open(tsv) as f:
        for line in csv.reader(f, delimiter="\t"):
            if line[0].startswith("#"):
                continue
            rogetid = line[1].split("/")[-1]
            if rogetid in rogetdict:
                # rogetdict values are (subsection, section, class) tuples
                roget_l3, roget_l2, roget_l1 = rogetdict[rogetid]
            else:
                roget_l3 = roget_l2 = roget_l1 = ""
            for senseid in set(line[3].split(":")):
                lexicon.setdefault(senseid, set()).add(
                    (rogetid, roget_l3, roget_l2, roget_l1))

            # Make mapping between Roget and Bring classes
            if line[0].split("/")[1] == "B":
                classmapping[rogetid] = line[2]

    # Re-shape each sense's tuples into per-level sets; empty hierarchy
    # levels (unknown Roget IDs) are filtered out.
    for senseid, rogetids in lexicon.items():
        roget_head = {tup[0] for tup in rogetids}
        roget_subsection = {tup[1] for tup in rogetids if tup[1]}
        roget_section = {tup[2] for tup in rogetids if tup[2]}
        roget_class = {tup[3] for tup in rogetids if tup[3]}
        lexicon[senseid] = {
            "roget_head": roget_head,
            "roget_subsection": roget_subsection,
            "roget_section": roget_section,
            "roget_class": roget_class,
            "bring": {classmapping[r] for r in roget_head}
        }

    testwords = ["fågel..1", "behjälplig..1", "köra_ner..1"]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read")
    return lexicon
Exemplo n.º 3
0
def read_xml(xml):
    """Read the XML version of crosslinked lexicon.

    Return a dictionary {dalin_lemgram: {"saldo": saldo_lemgram, "match": match}}
    built from the non-"modern" FormRepresentations of every LexicalEntry.
    """
    log.info("Reading XML lexicon")
    lexicon = {}

    context = etree.iterparse(
        xml,
        events=("start",
                "end"))  # "start" needed to save reference to root element
    context = iter(context)
    _event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':

                lemma = elem.find("Lemma")
                dalin, saldo = [], ''
                for form in lemma.findall("FormRepresentation"):
                    cat = _findval(form, "category")
                    lem = _findval(form, "lemgram")
                    if cat == "modern":
                        # The modern form gives the SALDO lemgram for the entry
                        saldo = lem
                    else:
                        match = _findval(form, "match")
                        dalin.append((lem, match))

                # Plain loop instead of a side-effect-only list comprehension:
                # comprehensions are for building values, not for mutation.
                for lem, match in dalin:
                    lexicon[lem] = {'saldo': saldo, 'match': match}

            # Done parsing section. Clear tree to save memory
            if elem.tag in ['LexicalEntry', 'frame', 'resFrame']:
                root.clear()

    testwords = ["tigerhjerta..nn.1", "lågland..nn.1", "gud..nn.1"]
    util.test_lexicon(lexicon, testwords)

    log.info("OK, read")
    return lexicon
Exemplo n.º 4
0
def read_swefn(xml, verbose=True):
    """Read the XML version of the swedish Framenet resource.

    Return a lexicon dictionary, {saldoID: {swefnID}}.

    Arguments:
        xml: path to (or file-like object with) the SweFN XML resource.
        verbose: if True, log progress messages.
    """
    if verbose:
        log.info("Reading XML lexicon")
    lexicon = {}

    context = etree.iterparse(
        xml,
        events=("start",
                "end"))  # "start" needed to save reference to root element
    context = iter(context)
    _event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':
                sense = elem.find("Sense")
                # BUGFIX: str.lstrip("swefn--") strips any leading run of the
                # characters {s, w, e, f, n, -}, so an ID whose name itself
                # starts with one of those letters would be mangled
                # (e.g. "swefn--sending" -> "ding"). Remove the literal
                # "swefn--" prefix instead.
                sid = sense.get("id")
                if sid.startswith("swefn--"):
                    sid = sid[len("swefn--"):]
                for lu in sense.findall("feat[@att='LU']"):
                    saldosense = lu.get("val")
                    lexicon.setdefault(saldosense, set()).add(sid)

            # Done parsing section. Clear tree to save memory
            if elem.tag in ['LexicalEntry', 'frame', 'resFrame']:
                root.clear()

    testwords = [
        "slant..1", "befrielse..1", "granne..1", "sisådär..1",
        "mjölkcentral..1"
    ]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read.")
    return lexicon
Exemplo n.º 5
0
def read_sensaldo(tsv, verbose=True):
    """Read the TSV version of the sensaldo lexicon (sensaldo-base.txt).

    Return a lexicon dictionary: {saldo_id: label}.

    Arguments:
        tsv: an already-open file-like object; its whole content is read
            with .read() (it is NOT a path).
        verbose: if True, log progress messages.
    """
    if verbose:
        log.info("Reading TSV lexicon")
    lexicon = {}

    content = tsv.read()
    for line in content.split("\n"):
        # Skip blank lines; comment lines start with "#"
        if line.lstrip():
            if line.startswith("#"):
                continue
            saldoid, label = line.split()
            lexicon[saldoid] = label

    testwords = ["förskräcklig..1", "ödmjukhet..1", "handla..1"]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read")
    return lexicon
Exemplo n.º 6
0
def read_lmf(xml,
             annotation_elements=("gf", "lem", "saldo"),
             tagset="SUC",
             verbose=True):
    """Read the XML version of SALDO's morphological lexicon (saldom.xml).

    Return a lexicon dictionary, {wordform: {{annotation-type: annotation}: ( set(possible tags), set(tuples with following words) )}}
     - annotation_element is the XML element for the annotation value (currently: 'gf' for baseform, 'lem' for lemgram or 'saldo' for SALDO id)
     - tagset is the tagset for the possible tags (currently: 'SUC', 'Parole', 'Saldo')
    """
    # assert annotation_element in ("gf", "lem", "saldo"), "Invalid annotation element"
    tagmap = util.tagsets.mappings["saldo_to_" + tagset.lower()]
    if verbose:
        log.info("Reading XML lexicon")
    lexicon = {}

    context = etree.iterparse(
        xml,
        events=("start",
                "end"))  # "start" needed to save reference to root element
    context = iter(context)
    event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == "LexicalEntry":
                # HashableDict so the annotation dict can be used as a key
                # in the nested lexicon dictionary below.
                annotations = HashableDict()

                for a in annotation_elements:
                    annotations[a] = tuple(x.text for x in elem.findall(a))

                pos = elem.findtext("pos")
                inhs = elem.findtext("inhs")  # inherent features; "-" means none
                if inhs == "-":
                    inhs = ""
                inhs = inhs.split()

                # Check the paradigm for an "x", meaning a multi-word expression with a required gap
                p = elem.findtext("p")
                x_find = re.search(r"_x(\d*)_", p)
                x_insert = x_find.groups()[0] if x_find else None
                if x_insert == "":
                    # An unnumbered "_x_" gap defaults to position 1
                    x_insert = "1"

                # Only vbm and certain paradigms allow gaps
                gap_allowed = (pos == "vbm"
                               or p in (u"abm_x1_var_än", u"knm_x_ju_ju",
                                        u"pnm_x1_inte_ett_dugg",
                                        u"pnm_x1_vad_än", u"ppm_x1_för_skull"))

                table = elem.find("table")
                multiwords = []  # accumulates the parts of the current MWE

                for form in list(table):
                    word = form.findtext("wf")
                    param = form.findtext("param")

                    if param in ("frag", "c", "ci", "cm"):
                        # We don't use these wordforms, so skip
                        continue
                    elif param[-1].isdigit() and param[-2:] != "-1":
                        # Handle multi-word expressions
                        # (param apparently ends in "part-total", e.g. "1:2-3";
                        # the parts of one MWE arrive as consecutive forms —
                        # TODO confirm against saldom.xml)
                        multiwords.append(word)
                        multipart, multitotal = param.split(":")[-1].split("-")
                        particle = bool(re.search(
                            r"vbm_.+?p.*?\d+_", p))  # Multi-word with particle

                        # Add a "*" where the gap should be
                        if x_insert and multipart == x_insert:
                            multiwords.append("*")

                        if multipart == multitotal:
                            # Last part reached: key on the first word and
                            # store the remaining words as the "following
                            # words" tuple.
                            lexicon.setdefault(multiwords[0], {}).setdefault(
                                annotations,
                                (set(), set(), gap_allowed, particle))[1].add(
                                    tuple(multiwords[1:]))
                            multiwords = []
                    else:
                        # Single word expressions
                        if param[-2:] == "-1":
                            param = param.rsplit(" ", 1)[0]
                            if pos == "vbm":
                                # NOTE(review): this rebinding of `pos` persists
                                # for the remaining forms of this entry —
                                # confirm that is intended.
                                pos = "vb"
                        saldotag = " ".join([pos] + inhs + [param])
                        tags = tagmap.get(saldotag)
                        if tags:
                            # Single words never allow gaps or particles,
                            # hence the two False flags.
                            lexicon.setdefault(word, {}).setdefault(
                                annotations,
                                (set(), set(), False, False))[0].update(tags)

            # Done parsing section. Clear tree to save memory
            if elem.tag in ["LexicalEntry", "frame", "resFrame"]:
                root.clear()

    testwords = [
        "äggtoddyarna", "Linköpingsbors", "katabatiska", "väg-", "formar",
        "in", "datorrelaterade"
    ]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read")
    return lexicon
Exemplo n.º 7
0
def read_lmf(xml,
             annotation_elements=("writtenForm", "lemgram"),
             tagset="SUC",
             verbose=True,
             skip_multiword=False,
             translate_tags=True):
    """Read the XML version of a morphological lexicon in lmf format (dalinm.xml, swedbergm.xml).

    Return a lexicon dictionary, {wordform: {{annotation-type: annotation}: ( set(possible tags), set(tuples with following words) )}}
    - annotation_element is the XML element for the annotation value, "writtenForm" for baseform, "lemgram" for lemgram
        writtenForm is translated to "gf" and lemgram to "lem" (for compatability with Saldo)
    - skip_multiword is a flag telling whether to make special entries for multiword expressions. Set this to False only if
        the tool used for text annotation cannot handle this at all
    - translate_tags: if True, map SALDO-style tags through tagmap; otherwise
        store a raw "pos msd" string as the only tag
    """
    # assert annotation_element in ("writtenForm lemgram") "Invalid annotation element"
    if verbose:
        log.info("Reading XML lexicon")
    lexicon = {}
    tagmap = util.tagsets.mappings["saldo_to_" + tagset.lower()]

    context = etree.iterparse(
        xml,
        events=("start",
                "end"))  # "start" needed to save reference to root element
    context = iter(context)
    event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == "LexicalEntry":
                # HashableDict so the annotation dict can be used as a key
                # in the nested lexicon dictionary below.
                annotations = HashableDict()

                lem = elem.find("Lemma").find("FormRepresentation")
                for a in annotation_elements:
                    # Translate element names to SALDO-compatible keys
                    if a == "writtenForm":
                        key = "gf"
                    elif a == "lemgram":
                        key = "lem"
                    annotations[key] = tuple([findval(lem, a)])

                pos = findval(lem, "partOfSpeech")
                inhs = findval(lem, "inherent")  # inherent features; "-" means none
                if inhs == "-":
                    inhs = ""
                inhs = inhs.split()

                # there may be several WordForms
                for forms in elem.findall("WordForm"):
                    word = findval(forms, "writtenForm")
                    param = findval(forms, "msd")

                    multiwords = []
                    # NOTE(review): `word` is rebound by this loop; a form
                    # containing spaces is treated as a multi-word expression.
                    wordparts = word.split()
                    for i, word in enumerate(wordparts):
                        if (not skip_multiword) and len(wordparts) > 1:

                            # Handle multi-word expressions
                            multiwords.append(word)

                            # We don't use any particles or mwe:s with gaps since that information is not formally
                            # expressed in the historical lexicons
                            particle = False
                            mwe_gap = False  # but keep the fields so that the file format matches the normal saldo-pickle format

                            # is it the last word in the multi word expression?
                            if i == len(wordparts) - 1:
                                # Key on the first word; store the rest as the
                                # "following words" tuple.
                                lexicon.setdefault(
                                    multiwords[0],
                                    {}).setdefault(annotations,
                                                   (set(), set(), mwe_gap,
                                                    particle))[1].add(
                                                        tuple(multiwords[1:]))
                                multiwords = []
                        else:
                            # Single word expressions
                            # (NOTE(review): with skip_multiword=True, each part
                            # of a multi-word form ends up here as a separate
                            # single-word entry — confirm that is intended.)
                            particle = False  # we don't use any particles or mwe:s with gaps
                            mwe_gap = False  # but keep the fields so that the file format match the normal saldo-pickle format

                            if translate_tags:
                                tags = convert_default(pos, inhs, param,
                                                       tagmap)
                                if tags:
                                    lexicon.setdefault(word, {}).setdefault(
                                        annotations,
                                        (set(), set(), mwe_gap,
                                         particle))[0].update(tags)
                            else:
                                saldotag = " ".join(
                                    [pos, param]
                                )  # this tag is rather useless, but at least gives some information
                                tags = tuple([saldotag])
                                lexicon.setdefault(word, {}).setdefault(
                                    annotations, (set(), set(), mwe_gap,
                                                  particle))[0].update(tags)

            # Done parsing section. Clear tree to save memory
            if elem.tag in ["LexicalEntry", "frame", "resFrame"]:
                root.clear()
    if verbose:
        testwords = ["äplebuske", "stöpljus", "katt"]
        util.test_lexicon(lexicon, testwords)
        log.info("OK, read")
    return lexicon