Пример #1
0
 def test_find_referent(self):
     """Check the referent() path function on tiers and items of xc3.

     With no argument, or with "segmentation", the "words" tier and its
     items are expected to resolve into the first tier of the first IGT;
     with "alignment" nothing is expected to be found.
     """
     # tier-level lookups
     assert xp.find(xc3, '//tier[@type="words"]/referent()') == xc3[0][0]
     # identity check: find() signals "no match" with None itself
     assert xp.find(xc3, '//tier[@type="words"]/referent("alignment")') is None
     assert xp.find(xc3, '//tier[@type="words"]/referent("segmentation")') == xc3[0][0]
     # item-level lookups: every expected match is the same item, xc3[0][0][0]
     assert xp.find(xc3, '//item[../@type="words"]/referent()') == xc3[0][0][0]
     assert xp.findall(xc3, '//item[../@type="words"]/referent()') == [xc3[0][0][0], xc3[0][0][0], xc3[0][0][0]]
     assert xp.findall(xc3, '//item[../@type="words"]/referent("alignment")') == []
     assert xp.findall(xc3, '//item[../@type="words"]/referent("segmentation")') == [xc3[0][0][0], xc3[0][0][0], xc3[0][0][0]]
Пример #2
0
 def test_find_referrer(self):
     """Check the referrer() path function on tiers and items of xc3."""
     igt = xc3[0]
     seg_referrer = igt[1]   # expected for referrer("segmentation") on phrases
     aln_referrer = igt[5]   # expected for referrer("alignment") on phrases
     word_referrer = igt[2]  # its items are expected for the "words" items

     # tier-level lookups from the "phrases" tier
     found = xp.find(xc3, '//tier[@type="phrases"]/referrer()')
     assert found == aln_referrer  # because "alignment" comes before "segmentation"
     found_all = xp.findall(xc3, '//tier[@type="phrases"]/referrer()')
     assert found_all == [aln_referrer, seg_referrer]
     assert xp.find(xc3, '//tier[@type="phrases"]/referrer("segmentation")') == seg_referrer
     assert xp.find(xc3, '//tier[@type="phrases"]/referrer("alignment")') == aln_referrer

     # item-level lookups from the "phrases" items
     assert xp.find(xc3, '//item[../@type="phrases"]/referrer()') == aln_referrer[0]
     found_all = xp.findall(xc3, '//item[../@type="phrases"]/referrer()')
     assert found_all == [aln_referrer[0], seg_referrer[0], seg_referrer[1], seg_referrer[2]]
     found_all = xp.findall(xc3, '//item[../@type="phrases"]/referrer("alignment")')
     assert found_all == [aln_referrer[0]]

     # item-level lookups from the "words" items
     found_all = xp.findall(xc3, '//item[../@type="words"]/referrer("segmentation")')
     assert found_all == [word_referrer[0], word_referrer[1], word_referrer[2],
                          word_referrer[3], word_referrer[4], word_referrer[5]]
Пример #3
0
def remove_language_name(items, igt):
    """
    Split language names/codes off the edges of non-meta lines.

    The language code and name are looked up on *igt* with LANG_CODE_PATH
    and LANG_NAME_PATH. Candidate tokens are derived from them: each part
    of a ':'-separated code (as given, upper- and lower-cased), the name,
    the upper-cased name, the initials of a multiword name, and the first
    three characters of the name. For every item whose first tag is not
    'M', a candidate token at the very start or very end of the text
    (optionally wrapped in round or square brackets) is replaced via the
    module-level `whitespace` replacement, and a new Item tagged 'M+LN'
    holding the removed text is inserted directly after the source item.
    If the text changed and the item carried an 'LN' tag, that tag is
    dropped from the item.

    Items are modified in place (text and 'tag' attribute); the *items*
    list itself is not modified.

    :param items: list of line items to inspect
    :param igt: the IGT from which the language code and name are read
    :returns: a new list with the 'M+LN' items interleaved, or *items*
              itself when no usable language token could be derived
    """
    lgcode = xp.find(igt, LANG_CODE_PATH)
    lgname = xp.find(igt, LANG_NAME_PATH)
    lgtoks = []
    if lgcode and '?' not in lgcode and '*' not in lgcode:
        codes = set(lgcode.split(':'))  # split up complex codes
        codes.update([code.upper() for code in codes])
        codes.update([code.lower() for code in codes])
        lgtoks.extend(codes)
    if lgname and '?' not in lgname:
        lgtoks.append(lgname)
        lgtoks.append(lgname.upper())
        if re.search('[- ]', lgname, re.U):  # abbreviation for multiword names
            # flags must be passed by keyword: the third positional
            # argument of re.split is maxsplit, not flags
            parts = re.split(r'[- ]+', lgname, flags=re.U)
            # a leading/trailing separator yields empty parts; skip them
            lgtoks.append(''.join(part[0] for part in parts if part))
        if re.search(r'^\w{3}', lgname, re.U):
            lgtoks.append(lgname[:3])

    # an empty alternative would make the patterns below match every line
    lgtoks = [tok for tok in lgtoks if tok]
    if not lgtoks:
        return items

    sig = '|'.join(re.escape(t) for t in lgtoks)
    start_lg_re = re.compile(r'^\s*[(\[]?({})[)\]]?'.format(sig), re.U)
    end_lg_re = re.compile(r'[(\[]?({})[)\]]?\s*$'.format(sig), re.U)

    def make_meta(item, match):
        # new 'M+LN' item that keeps the text removed from *item*
        meta_item = Item(id=item.id,
                         text=match.group(0).strip(),
                         attributes=dict(item.attributes))
        meta_item.attributes['tag'] = 'M+LN'
        return meta_item

    new_items = []
    for item in items:
        new_items.append(item)  # add now; might be modified later
        tags = get_tags(item)
        if tags[0] == 'M':
            continue
        orig = item.text
        m = start_lg_re.match(item.text)
        if m:
            new_items.append(make_meta(item, m))
            item.text = start_lg_re.sub(whitespace, item.text)
        m = end_lg_re.search(item.text)
        if m:
            # collect into new_items (not items): appending to the list
            # being iterated would grow the caller's list and re-visit
            # the meta item at the end of the loop
            new_items.append(make_meta(item, m))
            item.text = end_lg_re.sub(whitespace, item.text).rstrip()
        if 'LN' in tags and item.text != orig:
            tags.remove('LN')
            item.attributes['tag'] = '+'.join(tags)
    return new_items
Пример #4
0
 def get_igts(self, corpus_id, ids=None, matches=None):
     """
     Return the decoded IGTs of a corpus as a list, optionally filtered.

     Whatever self._read_igts() yields for *corpus_id* (restricted by
     *ids*) is decoded with xigtjson.decode_igt. When *matches* is not
     None, it is treated as an iterable of path expressions and only
     IGTs for which at least one expression finds something are kept.
     """
     decoded = (xigtjson.decode_igt(raw)
                for raw in self._read_igts(corpus_id, ids=ids))
     if matches is None:
         return list(decoded)
     # matches are a disjunction (only one has to match)
     return [igt for igt in decoded
             if any(xp.find(igt, path) is not None for path in matches)]
Пример #5
0
def make_sortkey(sortkeys):
    """
    Build a key function for digit-aware sorting on several paths.

    For each path in *sortkeys* the key function looks the value up with
    xp.find() (a falsy result is treated as ''), splits it into
    alternating non-digit and digit runs, and converts the digit runs to
    int so that, e.g., 'i2' sorts before 'i10'.

    The parts are grouped per sort key instead of being flattened into
    one list. In a flat list, a value with fewer digit runs shifts the
    parts of the following sort key onto positions where another
    object's key holds ints, and comparing str with int raises TypeError
    on Python 3. Grouping keeps every comparison aligned; orderings that
    worked with the flat form are unchanged.

    :param sortkeys: iterable of path expressions, most significant first
    :returns: a function usable as the ``key`` argument of sorted()/sort()
    """
    # return int values if possible (for int comparison), otherwise strings
    def safe_int(x):
        try:
            return int(x)
        except ValueError:
            return x

    def key(x):
        return [[safe_int(part)
                 for part in re.split(r'(\d+)', xp.find(x, sk) or '')]
                for sk in sortkeys]

    return key
Пример #6
0
 def test_find_simple_path(self):
     """Check plain child-step paths, with and without a leading '/'."""
     assert xp.find(xc1, '/igt') == xc1[0]
     assert xp.find(xc1, '/igt/tier') == xc1[0][0]
     assert xp.find(xc1, '/igt/tier/item') == xc1[0][0][0]
     assert xp.find(xc1, 'igt/tier/item') == xc1[0][0][0]
     # a relative path is resolved from the object passed in: no match is
     # expected from the corpus, but the same path matches from the IGT
     assert xp.find(xc1, 'tier/item') is None
     assert xp.find(xc1[0], 'tier/item') == xc1[0][0][0]
Пример #7
0
 def test_find_node(self):
     """Check single-step name lookups from the corpus, IGT and tier.

     A bare name is expected to match only direct children of the object
     the search starts from; everything else is expected to yield None.
     """
     assert xp.find(xc1, 'igt') == xc1[0]
     assert xp.find(xc1, 'tier') is None
     assert xp.find(xc1[0], 'tier') == xc1[0][0]
     assert xp.find(xc1, 'item') is None
     assert xp.find(xc1[0], 'item') is None
     assert xp.find(xc1[0][0], 'item') == xc1[0][0][0]
Пример #8
0
 def test_find_metadata(self):
     """Check path lookups into the metadata of the first IGT of xc1m."""
     metadata = xc1m[0].metadata[0]
     meta = metadata[0]
     subject = meta[0]

     # the metadata and meta containers themselves
     assert xp.find(xc1m, 'igt/metadata') == metadata
     assert xp.findall(xc1m, 'igt/metadata') == [metadata]
     assert xp.find(xc1m, 'igt/metadata/meta') == meta
     assert xp.findall(xc1m, 'igt/metadata/meta') == [meta]

     # children of meta: wildcard, prefixed name, and descendant step
     assert xp.find(xc1m, 'igt/metadata/meta/*') == subject
     assert xp.findall(xc1m, 'igt/metadata/meta/*') == [subject, meta[1]]
     assert xp.find(xc1m, 'igt/metadata/meta/dc:subject') == subject
     assert xp.find(xc1m, 'igt/metadata//dc:subject') == subject

     # attribute and text access on those children
     code = xp.find(xc1m, 'igt/metadata/meta/dc:subject/@olac:code')
     assert code == 'jpn'
     text = xp.find(xc1m, 'igt/metadata/meta/dc:subject/text()')
     assert text == 'Japanese'
     codes = xp.findall(xc1m, 'igt/metadata/meta/dc:*/@olac:code')
     assert codes == ['jpn', 'eng']
Пример #9
0
 def test_predicate(self):
     """Check bracketed predicates on tier and item steps."""
     igt = xc1[0]
     phrases, translations = igt[0], igt[1]

     # attribute predicate on the tier itself
     assert xp.find(xc1, '//tier[@type="phrases"]') == phrases
     assert xp.find(xc1, '//tier[@type="translations"]') == translations
     assert xp.find(xc1, '//tier[@type="phrases"]/item') == phrases[0]
     # predicate that looks at the parent's attribute
     assert xp.find(xc1, '//item[../@type="translations"]') == translations[0]
     # two predicates chained on one step
     gloss_item = xp.find(xc3, '//item[../@type="glosses"][value()="NOM"]')
     assert gloss_item == xc3[0][3][1]
Пример #10
0
 def find_descendants(self):
     """
     Check descendant lookups ('//' and './/') on xc1 and xc1m.

     NOTE(review): the name has no ``test_`` prefix, so prefix-based test
     discovery presumably never runs this method -- confirm whether that
     is intentional.
     """
     igt = xc1[0]
     first_item = igt[0][0]
     second_tier = igt[1]

     assert xp.find(xc1, '//item') == first_item
     assert xp.find(igt, './/item') == first_item
     # './/' is expected to search below the starting tier only ...
     assert xp.find(second_tier, './/item') == second_tier[0]
     # ... while a leading '//' is expected to find the corpus-wide first item
     assert xp.find(second_tier, '//item') == first_item
     assert xp.find(xc1m, '//meta') == xc1m[0].metadata[0][0]
Пример #11
0
 def test_find_relative(self):
     """Check the '.' (self) and '..' (parent) steps."""
     igt = xc1[0]
     # '.' is expected to give back the object the search started from
     assert xp.find(xc1, '.') == xc1
     assert xp.find(igt, '.') == igt
     # both parent paths are expected to lead from the IGT to the corpus
     for parent_path in ('..', '../.'):
         assert xp.find(igt, parent_path) == xc1
Пример #12
0
 def test_find_root(self):
     """Check that '/.' is expected to yield the corpus from any depth."""
     # start from the corpus, its first IGT, and that IGT's first tier
     for start in (xc1, xc1[0], xc1[0][0]):
         assert xp.find(start, '/.') == xc1
Пример #13
0
def wordlist(filelist, gloss=None, meta=None):
    r"""
    This function takes a list of Xigt-XML ODIN files, looks for the
    'normalized' ODIN tier, and grabs the contents of all gloss and
    meta lines. It tokenizes simply by matching all word characters
    (using regex's `\w` escape) so as to pull out hyphenated and dotted
    gloss line tokens.

    Nothing is returned. Each wordlist is written to its output path as
    tab-separated ``word<TAB>count`` lines, reverse sorted by count (ties
    broken by reverse word order). A wordlist whose path is falsy is not
    written. IGTs without a normalized tier are skipped.

    :param filelist: List of input files to process.
    :type filelist: list[str]
    :param gloss: Path to use for the output gloss wordlist.
    :type gloss: str
    :param meta: Path to use for the output meta wordlist.
    :type meta: str
    """
    gloss_words = defaultdict(int)
    meta_words = defaultdict(int)
    token_re = re.compile(r'[\w]+')  # compiled once, used for every line

    def update_count(lines, words):
        # Add the lowercased word-character tokens of each line to `words`.
        for line in lines:
            value = line.value()
            if not value:
                continue
            for w in value.split():
                for sub_w in token_re.findall(w):  # <-- tokenize
                    words[sub_w.lower()] += 1  # <-- lowercase, and add

    # -------------------------------------------
    # Iterate over all the paths in the list of files.
    # -------------------------------------------
    for path in filelist:
        with open(path, 'r', encoding='utf-8') as f:
            # Load the XigtCorpus, using the transient mode (most memory efficient)
            xc = xigtxml.load(f, mode='transient')

            # Now, iterate over each `Igt` instance in each file,
            for igt in xc:
                # Use a xigtpath expression to find the `tier` item that is a child of this node,
                # with state="normalized" as an attribute.
                norm_tier = xigtpath.find(igt, './tier[@state="normalized"]')
                if norm_tier is None:
                    # no normalized tier on this IGT: nothing to count
                    continue

                # Next, since the `tag` attribute can be G+CR or M+AC etc., grab all lines
                # with a tag that starts with the desired tag letter.
                gloss_lines = [item for item in norm_tier if item.attributes['tag'].startswith("G")]
                meta_lines = [item for item in norm_tier if item.attributes['tag'].startswith("M")]

                # Update the counts.
                update_count(gloss_lines, gloss_words)
                update_count(meta_lines, meta_words)

    # Define a function to write out the wordlist objects to files.
    # here, we will reverse sort by frequency of the word, and
    # tab-delineate the columns.
    def write_items(words, path):
        if not path:
            return
        items = sorted(words.items(), key=lambda x: (x[1], x[0]), reverse=True)
        # the context manager closes the file even if a write fails
        with open(path, 'w', encoding='utf-8') as f:
            for w, count in items:
                f.write('{}\t{}\n'.format(w, count))

    write_items(gloss_words, gloss)
    write_items(meta_words, meta)
0
def index(fn, by, idx):
    """
    Record in *idx* which IGT positions of file *fn* belong to each key.

    The corpus is loaded from *fn* in 'transient' mode. For every IGT,
    the path expression *by* is evaluated on it and the IGT's position
    in the corpus is added to ``idx[key][fn]``; *idx* therefore has to
    provide a set-like object (with ``add``) at that location.
    """
    corpus = xigtxml.load(fn, mode='transient')
    for position, igt in enumerate(corpus):
        key_value = xp.find(igt, by)
        positions = idx[key_value][fn]
        positions.add(position)
Пример #15
0
 def test_text(self):
     """Check that text() on the first matched item of xc1 gives its text."""
     found = xp.find(xc1, '//item/text()')
     assert found == 'inu=ga san-biki hoe-ru'
Пример #16
0
 def test_value(self):
     """Check that value() on the first "words" item of xc3 gives 'inu=ga'."""
     found = xp.find(xc3, '//tier[@type="words"]/item/value()')
     assert found == 'inu=ga'
Пример #17
0
 def test_disjunction(self):
     """Check parenthesised '|' alternatives at the start and inside a path."""
     phrases = xc1[0][0]
     translations = xc1[0][1]

     # whole-path alternatives
     found = xp.find(xc1, '(/igt/tier[@type="phrases"] | /igt/tier[@type="translations"])')
     assert found == phrases
     found_all = xp.findall(xc1, '(/igt/tier[@type="phrases"] | /igt/tier[@type="translations"])')
     assert found_all == [phrases, translations]

     # alternatives as one step of a longer path
     found = xp.find(xc1, 'igt/(tier[@type="phrases"] | tier[@type="translations"])')
     assert found == phrases
     found_all = xp.findall(xc1, 'igt/(tier[@type="phrases"] | tier[@type="translations"])')
     assert found_all == [phrases, translations]

     # a further step after the alternatives
     found_all = xp.findall(xc1, 'igt/(tier[@type="phrases"] | tier[@type="translations"])/item')
     assert found_all == [phrases[0], translations[0]]