def cldf(dataset, concepticon, **kw):
    for dset, srckey in zip(DSETS, SOURCES):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        src = getEvoBibAsSource(srckey)
        with CldfDataset(
                ('ID', 'Language_ID', 'Language_name', 'Language_iso',
                 'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                 'Segments', 'CLPA', 'Cognacy', 'Partial_cognacy'),
                dataset,
                subset=dset.split('-')[0]) as ds:
            ds.sources.add(src)
            for k in wl:
                ds.add_row([
                    '{0}-{1}'.format(srckey, k),
                    wl[k, 'glottolog'],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens']),
                    ' '.join(wl[k, 'clpa']),
                    wl[k, 'cogid'],
                    ' '.join([str(x) for x in wl[k, 'partialids']])
                ])
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                idf = '-'.join([slug(concept), '%s' % wl[k, 'cogid']])
                cognates += [[
                    '{0}-{1}'.format(srckey, k), ds.name, wl[k, 'ipa'], idf,
                    '', 'expert', srckey, '', '', ''
                ]]
            dataset.cognates.extend(
                iter_alignments(wl, cognates, method='progressive',
                                prefix=srckey + '-'))
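
# A minimal sketch (not part of the conversion) of the lingpy Wordlist
# access pattern used throughout these converters: integer keys are word
# IDs, wl[k, 'column'] reads a single cell, and iterating the wordlist
# yields the IDs. The in-memory data below is made up for illustration.
def _demo_wordlist_access():
    wl = lp.Wordlist({
        0: ['doculect', 'concept', 'ipa', 'cogid'],
        1: ['German', 'hand', 'hant', 1],
        2: ['English', 'hand', 'hænd', 1],
    })
    for k in wl:
        print(k, wl[k, 'doculect'], wl[k, 'ipa'], wl[k, 'cogid'])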
def cldf(dataset, concepticon, **kw):
    concepts = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    # dictionary to be passed to lingpy
    D = {}
    D[0] = [
        'doculect', 'glottolog', 'concept', 'concepticon', 'ipa', 'segments',
        'cogid', 'alignment'
    ]
    idx = 1
    for f in FILES:
        msa = lp.MSA(
            dataset.raw.joinpath('phonalign_{0}.msa'.format(f)).as_posix())
        # strip quotation marks from the concept label
        concept = msa.seq_id[1:-1]
        cid = concepts.get(concept, '')
        for i, taxon in enumerate(msa.taxa):
            if taxon in languages:
                tid = languages[taxon]
                alignment = ' '.join(msa.alignment[i])
                tokens = ' '.join([x for x in msa.alignment[i] if x != '-'])
                ipa = tokens.replace(' ', '')
                cogid = '{0}-{1}'.format(concept, f)
                D[idx] = [
                    taxon, tid, concept, cid, ipa, tokens, cogid, alignment
                ]
                idx += 1
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Value', 'Segments',
             'Cognacy', 'Source'),
            dataset) as ds:
        ds.sources.add(getEvoBibAsSource('Heggarty2007'))
        ds.sources.add(getEvoBibAsSource('List2014e'))
        alm = lp.Alignments(D)
        for k in alm:
            ds.add_row(
                ['Heggarty2007-{0}'.format(k)] +
                [alm[k, x] or '' for x in [
                    'glottolog', 'taxon', 'iso', 'concepticon', 'concept',
                    'ipa']] +
                [' '.join(alm[k, 'tokens']), alm[k, 'cogid'],
                 'Heggarty2007'])
            dataset.cognates += [[
                'Heggarty2007-{0}'.format(k), ds.name, alm[k, 'ipa'],
                alm[k, 'cogid'], '', 'expert', 'Heggarty2007',
                alm[k, 'alignment'], 'expert', 'List2014e'
            ]]
    dataset.write_cognates()
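
# Sketch of the lp.MSA interface the function above relies on: *.msa files
# expose the gloss as .seq_id (wrapped in quotes), the taxa as .taxa, and
# the aligned rows as .alignment. The file name below is hypothetical.
def _demo_read_msa():
    msa = lp.MSA('phonalign_1.msa')
    print(msa.seq_id)
    for taxon, row in zip(msa.taxa, msa.alignment):
        print(taxon, ' '.join(row))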
def cldf(dataset, concepticon, **kw):
    wl = lp.Alignments(dataset.raw.joinpath('tukano.tsv').as_posix())
    src1 = getEvoBibAsSource('Chacon2014')
    src2 = getEvoBibAsSource('Chacon2015')
    gloss2conc = {r['GLOSS']: r['CONCEPTICON_ID'] for r in dataset.concepts}
    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Value', 'Source', 'Segments',
            'Cognacy'), dataset) as ds:
        ds.sources.add(src1)
        ds.sources.add(src2)
        for k in wl:
            lid = wl[k, 'language']
            cogid = wl[k, 'cogid']
            concept = wl[k, 'concept']
            segments = wl[k, 'tokens']
            value = wl[k, 'ipa']
            alignment = wl[k, 'alignment']
            name, iso = abbr2lang[lid]
            cid = gloss2conc.get(concept)
            ds.add_row((
                'Chacon2014-' + str(k),
                dataset.glottocode_by_iso.get(iso, ''),
                name, iso, cid, concept, value, 'Chacon2014',
                ' '.join(segments), str(cogid)))
            # namespace the cognate ID by concept to make it globally unique
            cogid = '-'.join([slug(concept), '%s' % cogid])
            dataset.cognates.append([
                'Chacon2014-' + str(k), ds.name, wl[k, 'ipa'], cogid, '',
                'expert', 'Chacon2014', alignment, 'expert', 'Chacon2015'
            ])
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages
    }
    with UnicodeReader(
            dataset.raw.joinpath(FILENAME.replace('xls', 'Sheet1.csv'))) as r:
        rows = [row for row in r]
    # concepts sit in every second column of the header row
    concepts = [(i, rows[0][i].replace('_', ' ').strip())
                for i in range(1, len(rows[0]), 2)]
    assert all(concept in concepticon for _, concept in concepts)
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
             'Parameter_name', 'Value', 'Segments', 'Source'),
            dataset) as ds:
        ds.table.schema.columns['Value']['dc:format'] = 'IPA'
        ds.sources.add(getEvoBibAsSource(SOURCE))
        for row in rows[3:]:
            row = [col.strip() for col in row]
            if not row[0]:
                continue
            lname = row[0]
            for i, concept in concepts:
                for j, form in iterforms(row[i]):
                    if form != '?' and form.strip():
                        ds.add_row([
                            '%s-%s-%s' % (slug(lname), (i + 1) // 2, j),
                            language_map[lname],
                            lname.replace('_', ' '),
                            concepticon[concept],
                            concept,
                            form,
                            ' '.join(clean_string(form)),
                            SOURCE
                        ])
        # three methods for cognate detection: turchin, sca, lexstat;
        # turchin is fast and does not require a threshold
        cognates = iter_cognates(
            ds, column='Segments', method='turchin', threshold=0.55)
        # two methods for alignments: progressive or library
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments',
                            method='progressive'))
    dataset.write_cognates()
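
# iter_cognates/iter_alignments wrap lingpy's detection and alignment
# machinery; a sketch of the underlying lingpy calls, assuming a wordlist
# file with a 'tokens' column (the file name is hypothetical).
def _demo_lingpy_pipeline():
    lex = lp.LexStat('demo.tsv')
    # 'turchin' needs no threshold; 'sca' and 'lexstat' require one
    lex.cluster(method='turchin', ref='cogid')
    alm = lp.Alignments(lex, ref='cogid')
    alm.align(method='progressive')  # or method='library'
    return alm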
def cldf(dataset, concepticon, **kw):
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        if 'tokens' not in wl.header:
            wl.add_entries('tokens', 'ipa', lp.ipa2tokens,
                           merge_vowels=False, expand_nasals=True)
        src = getEvoBibAsSource(srckey)
        with CldfDataset(
                ('ID', 'Language_ID', 'Language_name', 'Language_iso',
                 'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                 'Segments', 'Cognacy', 'Loan'),
                dataset,
                subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(wl[k, 'doculect'],
                                                 wl[k, 'doculect'])
                # negative cognate IDs mark borrowings
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])
                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                ds.add_row([
                    wid, lang2glot[doculect], wl[k, 'doculect'], '',
                    gloss2con.get(wl[k, 'concept'], ''), wl[k, 'concept'],
                    wl[k, 'ipa'], srckey,
                    ' '.join(wl[k, 'tokens'] or ['']), cogid, wl[k, 'loan']
                ])
                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey, '', '', ''
                ])
            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates,
                                method='library'))
            for er in sorted(set(errors)):
                print(er, dset)
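
# Sketch of the add_entries call used above: it derives a new column by
# applying a function to an existing one, passing extra keywords through.
# The data below is made up for illustration.
def _demo_add_entries():
    wl = lp.Wordlist({
        0: ['doculect', 'concept', 'ipa'],
        1: ['German', 'hand', 'hant'],
    })
    wl.add_entries('tokens', 'ipa', lp.ipa2tokens, merge_vowels=False)
    print(wl[1, 'tokens'])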
def cldf(dataset, concepticon, **kw):
    with UnicodeReader(dataset.raw.joinpath('Wang2004.csv'),
                       delimiter='\t') as reader:
        lines = list(reader)
    lmap = dict([(x['ABBREVIATION'], (x['GLOTTOCODE'], x['ISO'], x['NAME']))
                 for x in dataset.languages])
    cmap = {c.english: c.concepticon_id
            for c in dataset.conceptlist.concepts.values()}
    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Value', 'Source', 'Cognacy'),
            dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        idx = 1
        cogids = {0: 0}
        for i, line in enumerate(lines[1:]):
            concept = line[0]
            cid = cmap[concept]
            for t, cogs in zip(lines[0][1:], line[1:]):
                glottocode, iso, taxon = lmap[t]
                # a cell may assign a word to several cognate sets ('/')
                for cog in cogs.split('/'):
                    if cog in cogids:
                        cogid = cogids[cog]
                    else:
                        cogid = max(list(cogids.values()) or [0]) + 1
                        cogids[cog] = cogid
                    ds.add_row((idx, glottocode, taxon, iso, cid, concept,
                                cog, SOURCE, cogid))
                    dataset.cognates.append([
                        idx, ds.name, cog,
                        '-'.join([slug(concept), str(cogid)]),
                        '', 'expert', SOURCE, '', '', ''])
                    idx += 1
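
# The cogids dict above interns arbitrary cognate-set labels as small
# integers. The same idiom in isolation, with made-up labels:
def _demo_intern_cogids():
    cogids = {0: 0}
    for label in ['a', 'b', 'a', 'c']:
        if label not in cogids:
            cogids[label] = max(cogids.values()) + 1
    return cogids  # {0: 0, 'a': 1, 'b': 2, 'c': 3}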
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    src = getEvoBibAsSource(SOURCE)
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Parameter_Chinese_name',
             'Value', 'Segments', 'Source'),
            dataset) as ds:
        ds.sources.add(src)
        for k in wl:
            # the substring check skips empty values and the placeholders
            # '-', '--' and '---'
            if wl[k, 'value'] not in '---' and wl[k, 'value'].strip():
                ds.add_row([
                    wl[k, 'lid'],
                    gcode[wl[k, 'doculect']],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'chinese'],
                    wl[k, 'value'],
                    clean_string(wl[k, 'value'])[0],
                    SOURCE
                ])
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    # map glosses in the raw data to their conceptlist counterparts
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }
    languages = {}
    words = []
    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)
    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages})

    sources = {}
    for src, langs in groupby(
            sorted(languages.values(), key=lambda r: r[6]), lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Value', 'Segments', 'Source',
            'Comment'), dataset) as ds:
        ds.sources.add(*sources.values())
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
        for i, row in enumerate(words):
            form = row[4]
            # skip empty forms and the placeholders '*' and '-'
            if not form or form in '*-':
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            # look up the glottocode by name, falling back to the ISO code
            gc = glottolog.get(lang, glottolog.get(languages[lang][7]))
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier; uncoded words get a singleton 'e<row>'
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' else (
                'e%s' % i)
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid, gc, lang, languages[lang][7], concepticon[row[1]],
                row[1], form, segments, sources[lang].id, None
            ])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]
        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))
    unmapped.pprint()
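
# Sketch of the renumber step used above: wl.renumber('cog') maps the
# string labels in 'cog' to the numeric IDs lingpy expects in 'cogid'.
# The data below is made up for illustration.
def _demo_renumber():
    wl = lp.Wordlist({
        0: ['doculect', 'concept', 'ipa', 'cog'],
        1: ['German', 'hand', 'hant', 'hand-1'],
        2: ['English', 'hand', 'hænd', 'hand-1'],
    })
    wl.renumber('cog')
    return [wl[k, 'cogid'] for k in wl]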
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages
    }
    header, rows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Wordlists.ActualWordlists.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                header = row
            else:
                rows.append(row)
    cheader, crows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Codings.Multistate.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                cheader = row
            else:
                crows.append(row)
    langs = header[1:]
    # normalize language names that use non-standard transcription
    clean_langs = {
        "Gɛ'ɛz": "Ge'ez",
        "Tigrɛ": "Tigre",
        'ʷalani': "Walani",
        "Ogadɛn Arabic": "Ogaden Arabic",
        "Mɛhri": "Mehri",
        "Gibbali": "Jibbali",
    }
    correct_concepts = {
        'Cold (air)': 'Cold (of air)',
    }
    src = getEvoBibAsSource('Kitchen2012')
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
             'Parameter_name', 'Value', 'Segments'),
            dataset) as ds:
        D = {0: ['doculect', 'concept', 'ipa', 'tokens']}
        idx = 1
        ds.sources.add(src)
        for row in rows:
            concept = row[0]
            for i, col in enumerate(row[1:]):
                lang = langs[i]
                if col != '---':
                    cleaned_string = clean_string(
                        col, merge_vowels=False, preparse=PREPARSE,
                        rules=CONVERSION, semi_diacritics='')[0]
                    ds.add_row([
                        'Kitchen2012-' + str(idx), language_map[lang],
                        clean_langs.get(lang, lang), concepticon[concept],
                        concept, col, cleaned_string
                    ])
                    D[idx] = [
                        clean_langs.get(lang, lang), concept, col,
                        cleaned_string
                    ]
                    idx += 1
        wl = lp.Wordlist(D)
        id2cog = {}
        errors = []
        for row in crows:
            taxon = row[0]
            for i, (concept, cog) in enumerate(zip(cheader[1:], row[1:])):
                nconcept = rows[i][0]
                if cog != '-':
                    idxs = wl.get_dict(taxon=taxon)
                    if idxs.get(nconcept, ''):
                        id2cog[idxs[nconcept][0]] = concept + '-' + cog
                    else:
                        errors += [(concept, nconcept, taxon)]
        # words without an expert coding get singleton cognate IDs
        bad_cogs = 1
        for k in wl:
            if k not in id2cog:
                id2cog[k] = str(bad_cogs)
                bad_cogs += 1
        wl.add_entries('cog', id2cog, lambda x: x)
        wl.renumber('cog')
        cognates = []
        for k in wl:
            cognates += [[
                'Kitchen2012-' + str(k), ds.name, wl[k, 'ipa'],
                wl[k, 'concept'] + '-' + str(wl[k, 'cogid']),
                '', 'expert', 'Kitchen2012', '', '', ''
            ]]
        dataset.cognates.extend(iter_alignments(lp.Alignments(wl), cognates))
def cldf(dataset, concepticon, **kw):
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    glotto_map = {c['NAME']: c['GLOTTOCODE'] for c in dataset.languages}
    # retrieve coordinates
    coords = {}
    langs = []
    # language map, as the names are not identical across sheets
    language_map = {
        "Namhsan": "Nam Hsan",
        "Pangkham": "Pang Kham",
        "Xiang Zhai Tang (Xiang Cai Tang)": "Xiang Zhai Tang"
    }
    with UnicodeReader(
            dataset.raw.joinpath('100item-phylo.Sheet2.csv')) as reader:
        for i, (num, lat, lon, village, country) in enumerate(reader):
            if i >= 1:
                coords[language_map.get(village, village)] = (lat, lon)
                langs.append(language_map.get(village, village))
    cognates = []
    idx = 1
    with UnicodeReader(dataset.raw.joinpath('100item-phylo.Sheet1.csv'),
                       delimiter=',') as reader, \
            CldfDataset((
                'ID', 'Language_ID', 'Language_name', 'Language_iso',
                'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                'Segments', 'Cognacy'), dataset) as ds:
        ds.sources.add(getEvoBibAsSource('Deepadung2015'))
        ds.metadata['coordinates'] = coords
        data = list(reader)
        header = data[2][2:]
        for i, row in enumerate(data[5:]):
            row = [c.strip() for c in row]
            concept = row[1]
            cid = concept_map[concept]
            # each language occupies two columns: form, cognate judgment
            for j in range(0, len(header), 2):
                lang = language_map.get(header[j], header[j])
                gcid = glotto_map[lang]
                cog = row[2:][j + 1]
                certainty = 0
                # uncertain judgments are given as 'x or y'; take the first
                if ' or ' in cog:
                    cog = cog.split(' ')[0]
                    certainty = 1
                word = CORRECT.get(row[2:][j], row[2:][j])
                if word.strip() and ''.join(set(word.strip())) != '-':
                    segments = lp.sequence.sound_classes.clean_string(
                        word, splitters=',', rules=CONVERSION,
                        preparse=PREPARSE, semi_diacritics="")[0]
                    cogid = slug(concept) + '-' + cog
                    ds.add_row([
                        idx, gcid, lang, '', cid, concept, word, PROVIDER,
                        segments, cogid
                    ])
                    cognates.append([
                        idx, ds.name, word, cogid, str(certainty), 'expert',
                        PROVIDER, '', '', ''
                    ])
                    idx += 1
        dataset.cognates.extend(
            iter_alignments(ds, cognates, method='progressive'))
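
# clean_string returns a list of space-segmented strings (one per form
# after splitting on the given splitters); the converters keep the first.
# A sketch with a made-up form; rules/preparse pass through as above.
def _demo_clean_string():
    from lingpy.sequence.sound_classes import clean_string
    return clean_string('kʰa,na', splitters=',')[0]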
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {x.english: x.concepticon_id
             for x in dataset.conceptlist.concepts.values()}
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2015d')
    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Value', 'Source', 'Cognacy'),
            dataset) as ds:
        ds.sources.add(src, src2)
        # map proto-forms to their cognate sets
        p2c = {}
        for k in wl:
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                gcode[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                ccode[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'ipa'],
                SOURCE,
                wl[k, 'cogid']
            ])
            dataset.cognates += [[
                '{0}-{1}'.format(SOURCE, k), ds.name, wl[k, 'ipa'],
                '-'.join([slug(wl[k, 'concept']), str(wl[k, 'cogid'])]),
                '', 'expert', SOURCE, '', '', ''
            ]]
            p2c[wl[k, 'proto']] = wl[k, 'cogid']
        idx = max([k for k in wl]) + 1
        for line in lp.csv2list(
                dataset.raw.joinpath('old_chinese.csv').as_posix()):
            for val in line[1].split(', '):
                ds.add_row((
                    '{0}-{1}'.format(SOURCE, idx), 'sini1245', 'Old Chinese',
                    '', ccode[line[0]], line[0], val, SOURCE,
                    p2c.get(val, val)))
                dataset.cognates += [[
                    '{0}-{1}'.format(SOURCE, idx), ds.name, val,
                    '-'.join([slug(line[0]), text_type(p2c.get(val, val))]),
                    '', 'expert', SOURCE, '', '', '']]
                idx += 1
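
# Cognate IDs in the raw data are only unique per concept, so they are
# namespaced with clldutils' slug, as in the '-'.join(...) calls above.
def _demo_cognate_id():
    from clldutils.misc import slug
    return '-'.join([slug('Cold (of air)'), str(12)])  # 'coldofair-12'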
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages
    }
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    # 'year' is unexpectedly missing from the mapped conceptlist (a
    # 200-item list); add it manually
    concept_map['year'] = '1226'
    wordlists = list(read_csv(dataset))
    cogsets = defaultdict(lambda: defaultdict(list))
    for wl in wordlists:
        for concept, (words, cogids) in wl.words.items():
            if len(cogids) == 1:
                cogsets[concept][cogids[0]].append(words[0])
    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Parameter_ID',
            'Parameter_name', 'Value', 'Segments', 'Source', 'Comment'),
            dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        cognates = []
        for wl in wordlists:
            for concept, (words, cogids) in wl.words.items():
                if len(cogids) > 1:
                    if len(words) < len(cogids):
                        if len(words) == 1:
                            if ':' in words[0]:
                                words = words[0].split(':')
                            if ',' in words[0]:
                                words = words[0].split(',')
                        assert len(words) >= len(cogids)
                        assert (wl.language, concept) in COGSET_MAP
                    if len(words) > len(cogids):
                        assert (wl.language, concept) in COGSET_MAP
                if (wl.language, concept) in COGSET_MAP:
                    word_to_cogid = COGSET_MAP[(wl.language, concept)]
                else:
                    word_to_cogid = dict(izip_longest(words, cogids))
                for i, word in enumerate(words):
                    if word.startswith('(') and word.endswith(')'):
                        word = word[1:-1].strip()
                    wid = '%s-%s-%s' % (slug(wl.language), slug(concept),
                                        i + 1)
                    ds.add_row([
                        wid, '', wl.language, concept_map.get(concept, ''),
                        concept, word, clean_string(word, splitters='?')[0],
                        SOURCE, ''
                    ])
                    if word_to_cogid.get(word):
                        cognates.append([
                            wid, ds.name, word,
                            '%s-%s' % (slug(concept), word_to_cogid[word]),
                            False, 'expert', SOURCE, '', '', ''
                        ])
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments'))
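
# dict(izip_longest(words, cogids)) pairs each word with a cognate ID and
# pads with None when the counts differ; the .get() above then skips the
# unpaired words. Plain-Python sketch (zip_longest on Python 3):
def _demo_word_to_cogid():
    from itertools import zip_longest
    return dict(zip_longest(['a', 'b', 'c'], [1, 2]))
    # {'a': 1, 'b': 2, 'c': None}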
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary \
            files as follows:

            - the JSON object from `metadata.json` is available as \
              `dataset.md`
            - items from languages.csv are available as `dataset.languages`
            - items from concepts.csv are available as `dataset.concepts`
            - if a Concepticon conceptlist was specified in metadata.json, \
              its ID is available as `dataset.conceptlist`

    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: All arguments passed on the command line.
    """
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())

    # get language identifiers and coordinates
    lids, cids = {}, {}
    for row in dataset.languages:
        lids[row['NAME']] = row['GLOTTOCODE']
    coords = dict([wl.coords[taxon] for taxon in lids])

    # glosses in the raw data that differ from concepts.csv
    modify = {
        'thunder (verb)': 'thunder',
        'flash (verb)': 'lightning',
        'room': 'flat',
        'have diarrea': 'have diarrhoea',
        'watery': 'light'
    }
    for row in dataset.concepts:
        concept = modify[row['CONCEPT']] if row['CONCEPT'] in modify else \
            row['CONCEPT']
        cids[concept] = row['CONCEPT_SET']

    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2014b')

    # assign partial cognate identifiers, one per character of the written
    # counterpart
    partial_ids = defaultdict(list)
    partial_converter = {}
    idx = 1
    for k in wl:
        for char in wl[k, 'counterpart']:
            if char in partial_converter:
                pidx = partial_converter[char]
            else:
                pidx = idx
                partial_converter[char] = idx
                idx += 1
            partial_ids[k] += [pidx]

    # trace whether a proto-language cognate set was already visited
    visited = []
    idx = max([k for k in wl]) + 1
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Parameter_Chinese_name',
             'Value', 'Value_Chinese_characters', 'Source', 'Segments',
             'Cognacy', 'Rank', 'Comment'),
            dataset) as ds:
        ds.sources.add(src)
        ds.sources.add(src2)
        D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']}
        for k in wl:
            tokens = lp.ipa2tokens(
                wl[k, 'ipa'], merge_vowels=False, expand_nasals=True)
            # remove sandhi annotation in tokens, as it confuses CLPA
            for i, t in enumerate(tokens):
                if '⁻' in t:
                    tokens[i] = t[:t.index('⁻')]
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                lids[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                cids[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'mandarin'],
                wl[k, 'ipa'],
                wl[k, 'counterpart'],
                SOURCE,
                ' '.join(tokens),
                wl[k, 'cogid'],
                wl[k, 'order'],
                wl[k, 'note'] if wl[k, 'note'] != '-' else '',
            ])
            D[k] = [
                wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'], tokens,
                wl[k, 'cogid']
            ]
            if wl[k, 'cogid'] not in visited:
                # we need to add the missing tones, otherwise segmentation
                # won't work: split into syllables, then append tone '¹'
                # (or '⁴' after final p/t/k) where no tone is marked
                syllables = wl[k, 'mch'].split('.')
                for i, s in enumerate(syllables):
                    if s[-1] not in '²³':
                        if s[-1] not in 'ptk':
                            syllables[i] += '¹'
                        else:
                            syllables[i] += '⁴'
                tokens = lp.ipa2tokens(''.join(syllables))
                ds.add_row([
                    '{0}-{1}'.format(SOURCE, idx),
                    'sini1245',
                    'Middle Chinese',
                    '',
                    cids[wl[k, 'concept']],
                    wl[k, 'concept'],
                    '',
                    wl[k, 'proto'],
                    wl[k, 'counterpart'],
                    SOURCE,
                    ' '.join(tokens),
                    wl[k, 'cogid'],
                    '',
                    ''
                ])
                D[idx] = [
                    'Middle Chinese', wl[k, 'concept'], wl[k, 'mch'],
                    tokens, wl[k, 'cogid']
                ]
                idx += 1
                visited += [wl[k, 'cogid']]
        alms = lp.Alignments(D)
        cognates = [[
            '{0}-{1}'.format(SOURCE, k), ds.name, alms[k, 'ipa'],
            '-'.join([slug(alms[k, 'concept']), str(alms[k, 'cogid'])]),
            '', 'expert', SOURCE, '', '', ''
        ] for k in alms]
        dataset.cognates.extend(
            iter_alignments(alms, cognates, method='library'))
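
# Sketch of the segmentation call used above: lp.ipa2tokens splits an IPA
# string into segments; merge_vowels/expand_nasals control diphthong and
# nasal handling. The form below is made up for illustration.
def _demo_ipa2tokens():
    return lp.ipa2tokens('tʰiau⁵⁵', merge_vowels=False, expand_nasals=True)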