示例#1
0
def check_morphemes(wordlist):
    """Report morpheme glosses that are linked to more than one cross-ID.

    The ``morphemes`` and ``crossids`` cells of every row are paired up
    position by position and grouped per ``(doculect, morpheme)``.  For each
    group whose cross-IDs are not all identical, a pipe-formatted table of the
    rows involved is printed and the function waits for the user to press
    Enter before moving on to the next group.

    Parameters
    ----------
    wordlist
        Object supporting ``iter_rows(*columns)`` and ``wordlist[idx, column]``
        lookups, with the columns ``doculect``, ``concept``, ``morphemes`` and
        ``crossids``.

    Returns
    -------
    None
        Results are only printed.
    """
    occurrences = defaultdict(list)
    for idx, doculect, morps, crossids in wordlist.iter_rows(
            'doculect', 'morphemes', 'crossids'):
        # zip() stops at the shorter sequence, so surplus morphemes or
        # cross-IDs within a row are silently left out of the comparison.
        for morp, crossid in zip(bt.lists(morps), bt.ints(crossids)):
            occurrences[doculect, morp].append((idx, str(crossid)))

    for (doc, morp), values in sorted(occurrences.items(),
                                      key=lambda x: x[0]):
        if len({crossid for _, crossid in values}) == 1:
            continue

        print('# {0} / {1}'.format(doc, morp))
        table = [[
            idx, wordlist[idx, 'doculect'], wordlist[idx, 'concept'],
            bt.lists(wordlist[idx, 'morphemes']), morp, crossid
        ] for idx, crossid in values]
        # Header order follows the row cells: the row's full morpheme list
        # comes first, then the single morpheme under inspection.
        print(
            tabulate(table,
                     headers=[
                         'id', 'doculect', 'concept', 'morphemes',
                         'morpheme', 'crossid'
                     ],
                     tablefmt='pipe'))
        # Pause so the user can read the table before the next one is shown.
        input()
示例#2
0
class Tests():
    """Assertion-based checks for the `lists`, `ints`, `floats` and `strings` types.

    The statements below sit directly in the class body, so they are executed
    once, in order, when the class is defined; every name assigned here ends
    up as a class attribute.
    """

    # Fixtures: one space-separated string containing a '+' marker and one
    # without it.
    string1 = '1 2 3 + 1 2 3'
    string2 = '1 2 3 1 2 3'

    # `lists`: the '+' counts as one of the 7 items, `.n` exposes 2 entries,
    # and str() reproduces the input string.
    l = lists(string1)
    assert len(l) == 7
    assert len(l.n) == 2
    assert str(l) == string1

    # `ints`: items compare equal to integers and str() reproduces the input.
    i = ints(string2)
    assert i[0] == 1
    assert str(i) == string2

    # `floats`: items equal the float value of the matching `ints` item; the
    # integer parts rebuild the input and str() renders a decimal point.
    f = floats(string2)
    assert float(i[0]) == f[0]
    assert ' '.join([str(fl).split('.')[0] for fl in f]) == string2
    assert str(f).split()[0].startswith('1.')

    # `strings`: str() reproduces the input, including the '+' item.
    s = strings(string1)
    assert str(s) == string1

    # Concatenation with `+`: works with another `strings` object and, for
    # `ints`, with a plain Python list.
    s = strings('1 2 3')
    assert str(s + s) == '1 2 3 1 2 3'
    i = ints('1 2 3')
    assert str(i + [1, 2, 3]) == '1 2 3 1 2 3'

    # Adding a plain string to `lists` joins it with a ' + ' separator.
    assert str(lists('b a + b a') + 'm a') == 'b a + b a + m a'

    # append() with a single-item string adds one element.
    app = strings('1 2 3')
    app.append('4')
    assert str(app) == '1 2 3 4'

    # extend() with a space-separated string adds each item.
    app = ints('1 2 3')
    app.extend('4 5')
    assert str(app) == '1 2 3 4 5'

    # append() with a string holding more than one item raises ValueError.
    assert_raises(ValueError, lambda x: strings(x).append('2 3'), '1 2')

    # extend() on `lists` adds the new items after a ' + ' separator.
    app = lists('1 2 3 + 1 2')
    app.extend('1 2')
    assert str(app) == '1 2 3 + 1 2 + 1 2'

    # Item assignment on `strings`: the int 2 reads back as the string '2'.
    app = strings('1 2 3')
    app[1] = 2
    assert app[1] == '2'
示例#3
0
def check_rootids(wordlist):
    """Report every cross-ID whose occurrences do not share one root ID.

    The ``crossids`` and ``rootids`` columns are first rewritten in place as
    ``bt.ints`` values.  For each cross-ID in the etymology dictionary, the
    root ID found at the same position in each row is collected.  A row whose
    ``rootids`` cell is too short is reported straight away.  Whenever the
    collected root IDs are not all identical, a pipe-formatted table is
    printed.  After every report the function waits for the user to press
    Enter.
    """
    # Convert both ID columns in place before the dictionaries are built.
    for row_id in wordlist:
        for column in ('crossids', 'rootids'):
            wordlist[row_id, column] = bt.ints(wordlist[row_id, column])

    cross_etymdict = wordlist.get_etymdict(ref='crossids')
    # The root-ID dictionary is requested as well although it is not read
    # afterwards; the call is kept in the same place.
    root_etymdict = wordlist.get_etymdict(ref='rootids')

    for crossid, slots in cross_etymdict.items():
        hits = []
        for slot in slots:
            # Falsy slots hold no row indices and are skipped.
            if not slot:
                continue
            for row_id in slot:
                row_crossids = wordlist[row_id, 'crossids']
                position = row_crossids.index(crossid)
                try:
                    root = wordlist[row_id, 'rootids'][position]
                except IndexError:
                    # No root ID at the position of this cross-ID.
                    print('[!] error in rootids', row_id,
                          wordlist[row_id, 'concept'],
                          wordlist[row_id, 'doculect'], row_crossids,
                          wordlist[row_id, 'rootids'])
                    input()
                else:
                    hits.append((row_id, position, root))

        if len({root for _, _, root in hits}) == 1:
            continue

        print('# crossid {0}'.format(crossid))
        rows = []
        for row_id, position, root in hits:
            doculect = wordlist[row_id, 'doculect']
            concept = wordlist[row_id, 'concept']
            segments = bt.lists(wordlist[row_id, 'tokens'])
            rows.append([
                row_id, doculect, concept, segments, segments.n[position],
                position, root
            ])
        header = [
            'id', 'doculect', 'concept', 'tokens', 'morpheme', 'crossidx',
            'rootid'
        ]
        print(tabulate(rows, headers=header, tablefmt='pipe'))
        input()
示例#4
0
def check_tokens(wordlist):
    """Report morpheme glosses that are paired with more than one token form.

    Each entry of a row's ``morphemes`` cell is paired, position by position,
    with the matching entry of ``bt.lists(tokens).n`` and grouped per
    ``(doculect, morpheme)``.  Groups with more than one distinct token form
    are printed as a pipe-formatted table, after which the function waits for
    the user to press Enter.
    """
    forms_by_gloss = defaultdict(list)
    rows = wordlist.iter_rows('doculect', 'morphemes', 'tokens')
    for row_id, doculect, glosses, segments in rows:
        pairs = zip(bt.lists(glosses), bt.lists(segments).n)
        for gloss, chunk in pairs:
            forms_by_gloss[doculect, gloss].append((row_id, str(chunk)))

    ordered = sorted(forms_by_gloss.items(), key=lambda item: item[0])
    for (doculect, gloss), hits in ordered:
        distinct_forms = {form for _, form in hits}
        if len(distinct_forms) == 1:
            continue

        print('# {0} / {1}'.format(doculect, gloss))
        report = [[row_id, form, ' '.join(wordlist[row_id, 'tokens'])]
                  for row_id, form in hits]
        print(
            tabulate(report,
                     headers=['idx', 'token', 'tokens'],
                     tablefmt='pipe'))
        input()
示例#5
0
def word_families(wordlist, morphemes='morphemes'):
    """Print, for every root ID, a table of the forms that share it.

    The ``tokens``, morpheme-gloss and ``rootids`` cells of each row are
    converted to ``bt.lists``, ``bt.strings`` and ``bt.ints`` and written back
    to the wordlist.  They are then paired up position by position and
    grouped by root ID.  Morphemes whose gloss starts with ``'_'`` are left
    out.  Families are printed from the smallest to the largest, and the rows
    inside a family are ordered by the string form of the root.

    Parameters
    ----------
    wordlist
        Object supporting ``iter_rows(*columns)`` and item get/set via
        ``wordlist[idx, column]``, with the columns ``doculect``, ``concept``,
        ``tokens``, ``rootids`` and the morpheme-gloss column.
    morphemes : str
        Name of the column holding the morpheme glosses.  Previously this
        argument was shadowed by a loop variable and therefore ignored; it is
        now honoured, and the default keeps the former behaviour.

    Returns
    -------
    None
        Results are only printed; the three columns are rewritten in place.
    """
    families = defaultdict(list)
    for idx, tokens, glosses, rootids in wordlist.iter_rows(
            'tokens', morphemes, 'rootids'):
        tokens = bt.lists(tokens)
        glosses = bt.strings(glosses)
        rootids = bt.ints(rootids)
        # Store the converted values so the report below reads them back in
        # their converted form.
        wordlist[idx, 'tokens'] = tokens
        wordlist[idx, morphemes] = glosses
        wordlist[idx, 'rootids'] = rootids

        for tok, morph, rootid in zip(tokens.n, glosses, rootids):
            if not morph.startswith('_'):
                families[rootid].append((idx, str(tok), morph))

    for rootid, vals in sorted(families.items(), key=lambda x: len(x[1])):
        print('# ROOT {0}'.format(rootid))
        table = [[
            idx,
            wordlist[idx, 'doculect'],
            wordlist[idx, 'concept'],
            tok,
            morph,
            wordlist[idx, 'tokens'],
            wordlist[idx, morphemes],
        ] for idx, tok, morph in sorted(vals, key=lambda x: x[1])]
        print(
            tabulate(table,
                     headers=[
                         'ID', 'Doculect', 'Concept', 'RootForm',
                         'RootConcept', 'Tokens', 'Morphemes'
                     ],
                     tablefmt='pipe'))
        print('')