def check_morphemes(wordlist):
    """Report morphemes that are linked to more than one cross-ID.

    Each row's ``morphemes`` entry is paired, position by position, with its
    ``crossids`` entry, and the pairs are grouped per ``(doculect, morpheme)``.
    For every group whose cross-IDs are not all identical, a heading and a
    pipe-formatted table of the affected rows are printed; the function then
    blocks on ``input()`` until the user confirms.

    Args:
        wordlist: Object providing ``iter_rows(...)`` and
            ``wordlist[idx, column]`` access for the columns ``doculect``,
            ``concept``, ``morphemes`` and ``crossids`` (presumably a LingPy
            ``Wordlist`` -- confirm against the caller).

    Returns:
        None. All output goes to stdout.
    """
    morphemes = defaultdict(list)
    for idx, doculect, morps, crossids in wordlist.iter_rows(
            'doculect', 'morphemes', 'crossids'):
        # zip() stops at the shorter sequence, so surplus morphemes or
        # cross-IDs within a row are ignored here.
        for morp, crossid in zip(bt.lists(morps), bt.ints(crossids)):
            morphemes[doculect, morp].append((idx, str(crossid)))

    for (doc, morp), values in sorted(morphemes.items(), key=lambda x: x[0]):
        # Only groups with conflicting cross-IDs are reported.
        if len({crossid for _, crossid in values}) == 1:
            continue
        print('# {0} / {1}'.format(doc, morp))
        table = [
            [
                idx,
                wordlist[idx, 'doculect'],
                wordlist[idx, 'concept'],
                bt.lists(wordlist[idx, 'morphemes']),
                morp,
                crossid,
            ]
            for idx, crossid in values
        ]
        # Header order follows the row layout: the full morpheme list of the
        # row first, then the single morpheme under inspection.
        print(
            tabulate(
                table,
                headers=[
                    'id', 'doculect', 'concept', 'morphemes', 'morpheme',
                    'crossid'
                ],
                tablefmt='pipe'))
        # Pause so the user can inspect each conflict before the next one.
        input()
class Tests():
    """Assertion-based checks for the ``lists``/``ints``/``floats``/``strings`` types.

    NOTE(review): all statements live directly in the class body, so they are
    executed once when the class is defined rather than inside a test method.
    ``lists``, ``ints``, ``floats``, ``strings`` and ``assert_raises`` are not
    defined in this block -- presumably imported at the top of the file.
    """

    # Shared fixtures: one string with a '+' token and one without.
    string1 = '1 2 3 + 1 2 3'
    string2 = '1 2 3 1 2 3'

    # lists: seven items overall, two entries in ``.n``, and str() gives the
    # input string back unchanged.
    l = lists(string1)
    assert len(l) == 7
    assert len(l.n) == 2
    assert str(l) == string1

    # ints: items compare equal to integers; str() gives the input back.
    i = ints(string2)
    assert i[0] == 1
    assert str(i) == string2

    # floats: items equal the float value of the matching ints item, and each
    # item is rendered with a decimal point ('1.' prefix for the first one).
    f = floats(string2)
    assert float(i[0]) == f[0]
    assert ' '.join([str(fl).split('.')[0] for fl in f]) == string2
    assert str(f).split()[0].startswith('1.')

    # strings: str() gives the input back.
    s = strings(string1)
    assert str(s) == string1

    # check for types: '+' with the same type, or with a plain list, yields a
    # value whose str() is the space-joined concatenation.
    s = strings('1 2 3')
    assert str(s + s) == '1 2 3 1 2 3'
    i = ints('1 2 3')
    assert str(i + [1, 2, 3]) == '1 2 3 1 2 3'

    # list check: adding a plain string to a lists value joins it with ' + '.
    assert str(lists('b a + b a') + 'm a') == 'b a + b a + m a'

    # append / extend
    app = strings('1 2 3')
    app.append('4')
    assert str(app) == '1 2 3 4'

    # extend() accepts a space-separated string.
    app = ints('1 2 3')
    app.extend('4 5')
    assert str(app) == '1 2 3 4 5'

    # append('2 3') on strings('1 2') must raise ValueError.
    assert_raises(ValueError, lambda x: strings(x).append('2 3'), '1 2')

    # extend() on a lists value adds the new material after a ' + '.
    app = lists('1 2 3 + 1 2')
    app.extend('1 2')
    assert str(app) == '1 2 3 + 1 2 + 1 2'

    # Item assignment with an int is read back as the string '2'.
    app = strings('1 2 3')
    app[1] = 2
    assert app[1] == '2'
def check_rootids(wordlist):
    """Report cross-IDs that are linked to more than one root-ID.

    The ``crossids`` and ``rootids`` cells of every row are first re-wrapped
    with ``bt.ints`` (in place). For each cross-ID returned by
    ``wordlist.get_etymdict(ref='crossids')``, the root-ID found at the same
    position as the cross-ID is collected from every row that carries it.
    If those root-IDs disagree, a heading and a pipe-formatted table of the
    rows are printed and the function blocks on ``input()``.

    Rows whose ``rootids`` entry is too short for the cross-ID's position are
    reported individually (also followed by ``input()``) and left out of the
    comparison.

    Args:
        wordlist: Object that can be iterated for row IDs and provides
            ``wordlist[idx, column]`` read/write access plus
            ``get_etymdict(ref=...)`` (presumably a LingPy ``Wordlist`` --
            confirm against the caller). The columns ``crossids``,
            ``rootids``, ``tokens``, ``doculect`` and ``concept`` are read.

    Returns:
        None. All output goes to stdout; the ``crossids`` and ``rootids``
        columns of ``wordlist`` are modified in place.

    Raises:
        ValueError: Propagated from ``crossids.index(key)`` if a row listed
            for a cross-ID does not actually contain it.
    """
    for idx in wordlist:
        for column in ('crossids', 'rootids'):
            wordlist[idx, column] = bt.ints(wordlist[idx, column])

    etd_cross = wordlist.get_etymdict(ref='crossids')
    for key, values in etd_cross.items():
        data = []
        for v in values:
            # Falsy entries carry no row IDs for this cross-ID.
            if not v:
                continue
            for idx in v:
                crossids = wordlist[idx, 'crossids']
                crossidx = crossids.index(key)
                try:
                    rootid = wordlist[idx, 'rootids'][crossidx]
                except IndexError:
                    print('[!] error in rootids', idx,
                          wordlist[idx, 'concept'],
                          wordlist[idx, 'doculect'], crossids,
                          wordlist[idx, 'rootids'])
                    input()
                else:
                    data.append((idx, crossidx, rootid))

        # Report only real conflicts; an empty ``data`` (every row failed
        # above) has nothing to compare and is skipped.
        if len({rootid for _, _, rootid in data}) > 1:
            print('# crossid {0}'.format(key))
            table = []
            for idx, crossidx, rootid in data:
                tokens = bt.lists(wordlist[idx, 'tokens'])
                table.append([
                    idx,
                    wordlist[idx, 'doculect'],
                    wordlist[idx, 'concept'],
                    tokens,
                    tokens.n[crossidx],
                    crossidx,
                    rootid,
                ])
            print(
                tabulate(
                    table,
                    headers=[
                        'id', 'doculect', 'concept', 'tokens', 'morpheme',
                        'crossidx', 'rootid'
                    ],
                    tablefmt='pipe'))
            # Pause so the user can inspect each conflict before the next one.
            input()
def check_tokens(wordlist):
    """Report morphemes that are paired with more than one token form.

    Each row's ``morphemes`` entry is zipped with the ``.n`` entries of its
    ``tokens`` cell (presumably the individual morpheme segments -- confirm
    against ``bt.lists``), and the pairs are grouped per
    ``(doculect, morpheme)``. For every group that contains more than one
    distinct token form, a heading and a pipe-formatted table are printed and
    the function blocks on ``input()``.

    Args:
        wordlist: Object providing ``iter_rows(...)`` and
            ``wordlist[idx, column]`` access for the columns ``doculect``,
            ``morphemes`` and ``tokens``.

    Returns:
        None. All output goes to stdout.
    """
    forms_by_morpheme = defaultdict(list)
    rows = wordlist.iter_rows('doculect', 'morphemes', 'tokens')
    for row_id, language, morpheme_cell, token_cell in rows:
        pairs = zip(bt.lists(morpheme_cell), bt.lists(token_cell).n)
        for gloss, segment in pairs:
            forms_by_morpheme[language, gloss].append((row_id, str(segment)))

    # Keys are unique, so sorting them gives the same order as sorting the
    # items by key.
    for key in sorted(forms_by_morpheme):
        language, gloss = key
        occurrences = forms_by_morpheme[key]
        distinct_forms = {form for _, form in occurrences}
        if len(distinct_forms) == 1:
            continue

        print('# {0} / {1}'.format(language, gloss))
        report = [
            [row_id, form, ' '.join(wordlist[row_id, 'tokens'])]
            for row_id, form in occurrences
        ]
        print(
            tabulate(
                report,
                headers=['idx', 'token', 'tokens'],
                tablefmt='pipe'))
        # Wait for the user before moving on to the next group.
        input()
def word_families(wordlist, morphemes='morphemes'):
    """Print, per root-ID, the forms that belong to that root.

    The ``tokens``, morpheme and ``rootids`` cells of every row are re-wrapped
    with ``bt.lists``, ``bt.strings`` and ``bt.ints`` and written back to
    ``wordlist``. The ``.n`` entries of the tokens are then zipped with the
    morphemes and root-IDs; every triple whose morpheme does not start with
    ``'_'`` is filed under its root-ID. Finally one pipe-formatted table per
    root-ID is printed, smallest family first, with rows ordered by the
    string form of the token.

    Args:
        wordlist: Object providing ``iter_rows(...)`` and
            ``wordlist[idx, column]`` read/write access (presumably a LingPy
            ``Wordlist`` -- confirm against the caller). The columns
            ``tokens``, ``rootids``, ``doculect`` and ``concept`` are used in
            addition to the morpheme column.
        morphemes: Name of the column that holds the morphemes. Defaults to
            ``'morphemes'``.

    Returns:
        None. All output goes to stdout; the three parsed columns of
        ``wordlist`` are modified in place.
    """
    # Keep the column name under its own name so the per-row values below
    # cannot shadow the parameter.
    morpheme_column = morphemes

    wf = defaultdict(list)
    for idx, tokens, morphs, rootids in wordlist.iter_rows(
            'tokens', morpheme_column, 'rootids'):
        tokens = bt.lists(tokens)
        morphs = bt.strings(morphs)
        rootids = bt.ints(rootids)
        wordlist[idx, 'tokens'] = tokens
        wordlist[idx, morpheme_column] = morphs
        wordlist[idx, 'rootids'] = rootids
        for tok, morph, rootid in zip(tokens.n, morphs, rootids):
            # Morphemes marked with a leading underscore are left out.
            if not morph.startswith('_'):
                wf[rootid].append((idx, str(tok), morph))

    for rootid, vals in sorted(wf.items(), key=lambda x: len(x[1])):
        print('# ROOT {0}'.format(rootid))
        table = [
            [
                idx,
                wordlist[idx, 'doculect'],
                wordlist[idx, 'concept'],
                tok,
                morph,
                wordlist[idx, 'tokens'],
                wordlist[idx, morpheme_column],
            ]
            for idx, tok, morph in sorted(vals, key=lambda x: x[1])
        ]
        print(
            tabulate(
                table,
                headers=[
                    'ID', 'Doculect', 'Concept', 'RootForm', 'RootConcept',
                    'Tokens', 'Morphemes'
                ],
                tablefmt='pipe'))
        print('')