예제 #1
0
파일: __init__.py 프로젝트: MacyL/cddb
def prepare(ds):
    """Build the segmented wordlist for *ds* from its raw ``bds.tsv`` file.

    For every row that has a non-empty ``value``, the row's ``tokens`` are
    cut into chunks with ``_get_slices``; each chunk is concatenated and the
    chunks are joined with single spaces. That string is passed through
    ``ds.transform`` twice (``'CLPA'`` and ``'Structure'``), and the results
    are stored together with the doculect, concept, Concepticon ID, value
    and ``partial_ids`` of the row.

    Problems are counted and printed (not raised):

    * the CLPA and structure strings differ in their number of
      space-separated items;
    * the CLPA string contains ``'«'``.

    The collected rows are written with ``ds.write_wordlist``.

    :param ds: dataset object providing ``raw``, ``transform`` and
        ``write_wordlist``.
    """
    errs = 0
    wl = Wordlist(ds.raw('bds.tsv'))
    W = {}
    for k in wl:
        value = wl[k, 'value']
        if not value:
            continue

        tokens = wl[k, 'tokens']
        morphemes = [''.join(tokens[a:b]) for a, b in _get_slices(tokens)]
        ipa = ' '.join(morphemes)

        clpa = ds.transform(ipa, 'CLPA')
        struc = ds.transform(ipa, 'Structure')

        # Explicit comparison instead of ``assert`` inside a bare ``except``:
        # the check must survive ``python -O`` and must not swallow
        # KeyboardInterrupt / SystemExit.
        if len(clpa.split(' ')) != len(struc.split(' ')):
            errs += 1
            print(errs, clpa, struc)
        if '«' in clpa:
            errs += 1
            print(errs, ipa, clpa, struc)

        W[k] = [
            wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'concepticon_id'],
            value, clpa, struc, wl[k, 'partial_ids']
        ]
    # Row 0 carries the column names of the new wordlist.
    W[0] = [
        'doculect', 'concept', 'concepticon_id', 'value', 'segments',
        'structure', 'cogids'
    ]
    ds.write_wordlist(Wordlist(W))
예제 #2
0
파일: util.py 프로젝트: MacyL/cddb
def get_inventories(wordlist, segments='tokens'):
    """Tabulate, per doculect, which sounds occur in which syllable slot.

    Each entry of every taxon is cut into chunks with ``_get_slices``; every
    chunk is parsed by ``sinopy.parse_chinese_morphemes`` into five parts
    (initial, medial, nucleus, final, tone). Every part that is not ``'-'``
    is recorded under ``(slot, sound)`` together with its position, written
    as ``'<entry id>:<chunk index>'``.

    :param wordlist: wordlist object exposing ``header``, ``taxa``,
        ``get_list`` and ``wordlist[idx, column]`` access.
    :param segments: name of the column holding the segmented forms; must be
        in ``wordlist.header``.
    :return: list of tuples. The first is the header
        ``('ID', 'DOCULECT', 'CONTEXT', 'VALUE', 'OCCURRENCES', 'CROSSREF')``;
        each following tuple holds a running ID (as a string), the taxon,
        the slot name, the sound, the number of occurrences and the
        space-joined positions.
    """
    assert segments in wordlist.header
    D = {t: defaultdict(list) for t in wordlist.taxa}
    contexts = ('initial', 'medial', 'nucleus', 'final', 'tone')
    for taxon in wordlist.taxa:
        for idx in wordlist.get_list(taxon=taxon, flat=True):
            tokens = wordlist[idx, segments]
            # NOTE(review): looks like leftover debug output; kept so the
            # function's stdout behaviour is unchanged.
            print(' '.join(tokens))
            for jdx, (sA, sB) in enumerate(_get_slices(tokens)):
                parts = sinopy.parse_chinese_morphemes(tokens[sA:sB])
                pos = '{0}:{1}'.format(idx, jdx)
                for context, sound in zip(contexts, parts):
                    if sound != '-':
                        D[taxon][context, sound].append(pos)

    I = [('ID', 'DOCULECT', 'CONTEXT', 'VALUE', 'OCCURRENCES', 'CROSSREF')]
    # Running row identifier. It was previously never incremented, so every
    # row ended up with the ID '1'.
    row_id = 1
    for t in wordlist.taxa:
        for (s, v), occ in sorted(
                D[t].items(),
                key=lambda x: (x[0][0], x[0][1], len(x[1]))):
            I.append((str(row_id), t, s, v, len(occ), ' '.join(occ)))
            row_id += 1
    return I
예제 #3
0
 def test__get_slices(self):
     """Check the end index of the first slice with and without tone splitting."""
     word = 'ba²te²'
     with_tone_split = _get_slices(list(word))
     without_tone_split = _get_slices(list(word), split_on_tones=False)
     # Default call: the first slice ends at index 3.
     assert with_tone_split[0][1] == 3
     # split_on_tones=False: the first slice ends at index 6.
     assert without_tone_split[0][1] == 6
예제 #4
0
파일: profile.py 프로젝트: MacyL/cddb
from lingpy import *
from lingpy.compare.partial import _get_slices

# Collect every distinct (segment string, label) pair found in the wordlist.
# A chunk whose first CV class is a consonant is split into its first token
# (label 'i') and the remainder (label 'f'); any other chunk is stored whole
# with label 'f'. Presumably 'i'/'f' stand for initial/final -- confirm.
wl = Wordlist('bds.tsv')
segments = set()
for key in wl:
    tokens = wl[key, 'tokens']
    if not ''.join(tokens):
        # Entries without any segment material are skipped.
        continue
    for start, stop in _get_slices(tokens):
        chunk = tokens[start:stop]
        cv_classes = tokens2class(chunk, 'cv')
        if cv_classes[0].lower() != 'c':
            segments.add((' '.join(chunk), 'f'))
            continue
        head, rest = chunk[0], chunk[1:]
        segments.add((' '.join(head), 'i'))
        segments.add((' '.join(rest), 'f'))

# One line per pair: the segment with its spaces removed, a tab, the label.
for text, label in segments:
    print(text.replace(' ', ''), '\t', label)

예제 #5
0
 def test__get_slices(self):
     """Verify where the first slice of 'ba²te²' ends in both splitting modes."""
     default_slices = _get_slices(list('ba²te²'))
     unsplit_slices = _get_slices(list('ba²te²'), split_on_tones=False)
     first_default, first_unsplit = default_slices[0], unsplit_slices[0]
     assert first_default[1] == 3
     assert first_unsplit[1] == 6
예제 #6
0
파일: util.py 프로젝트: MacyL/cddb
def slice_word(word):
    """Yield, in order, the pieces of *word* delimited by ``_get_slices``.

    Each piece is ``word[start:stop]`` for one ``(start, stop)`` pair
    returned by ``_get_slices(word)``.
    """
    boundaries = _get_slices(word)
    for start, stop in boundaries:
        yield word[start:stop]