def align_cognates(self, alm=None, cognates=None, column='Segments', method='library'):
    from pylexibank.lingpy_util import iter_alignments

    iter_alignments(
        alm or self,
        cognates or self.objects['CognateTable'],
        column=column,
        method=method)

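# A minimal usage sketch (hedged: `dataset` is assumed to be a pylexibank
# Dataset instance that already holds rows in its 'CognateTable'); with no
# explicit wordlist or cognate list passed, the method above falls back to
# the dataset itself and its collected cognates:
#
#     dataset.align_cognates(method='library')
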
def cldf(dataset, concepticon, **kw):
    for dset, srckey in zip(DSETS, SOURCES):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        src = getEvoBibAsSource(srckey)
        with CldfDataset((
                'ID', 'Language_ID', 'Language_name', 'Language_iso',
                'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                'Segments', 'CLPA', 'Cognacy', 'Partial_cognacy'),
                dataset, subset=dset.split('-')[0]) as ds:
            ds.sources.add(src)
            for k in wl:
                ds.add_row([
                    '{0}-{1}'.format(srckey, k),
                    wl[k, 'glottolog'],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens']),
                    ' '.join(wl[k, 'clpa']),
                    wl[k, 'cogid'],
                    ' '.join([str(x) for x in wl[k, 'partialids']]),
                ])
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                idf = '-'.join([slug(concept), '%s' % wl[k, 'cogid']])
                cognates += [[
                    '{0}-{1}'.format(srckey, k), ds.name, wl[k, 'ipa'], idf,
                    '', 'expert', srckey, '', '', '']]
            dataset.cognates.extend(
                iter_alignments(wl, cognates, method='progressive',
                                prefix=srckey + '-'))

def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages
    }
    with UnicodeReader(
            dataset.raw.joinpath(FILENAME.replace('xls', 'Sheet1.csv'))) as r:
        rows = [row for row in r]
    concepts = [(i, rows[0][i].replace('_', ' ').strip())
                for i in range(1, len(rows[0]), 2)]
    assert all(concept in concepticon for _, concept in concepts)

    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Parameter_ID',
            'Parameter_name', 'Value', 'Segments', 'Source'), dataset) as ds:
        ds.table.schema.columns['Value']['dc:format'] = 'IPA'
        ds.sources.add(getEvoBibAsSource(SOURCE))
        for row in rows[3:]:
            row = [col.strip() for col in row]
            if not row[0]:
                continue
            lname = row[0]
            for i, concept in concepts:
                for j, form in iterforms(row[i]):
                    if form != '?' and form.strip():
                        ds.add_row([
                            '%s-%s-%s' % (slug(lname), (i + 1) // 2, j),
                            language_map[lname],
                            lname.replace('_', ' '),
                            concepticon[concept],
                            concept,
                            form,
                            ' '.join(clean_string(form)),
                            SOURCE])

        # three cognate-detection methods are available: turchin, sca and
        # lexstat; turchin is fast and needs no threshold
        cognates = iter_cognates(
            ds, column='Segments', method='turchin', threshold=0.55)
        # two alignment methods are available: progressive and library
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments',
                            method='progressive'))
        dataset.write_cognates()

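# A hedged sketch of the alternative settings mentioned in the comments above:
# 'sca' and 'lexstat' are distance-based and therefore need a threshold (the
# 0.45 below is purely illustrative, not a tuned value), while 'library' is
# the slower counterpart to 'progressive' alignment.
#
#     cognates = iter_cognates(ds, column='Segments', method='sca', threshold=0.45)
#     dataset.cognates.extend(
#         iter_alignments(ds, cognates, column='Segments', method='library'))
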
def test_iter_cognates_and_alignments(dataset_cldf):
    assert not list(
        lingpy_util.iter_cognates(
            dataset_cldf.cldf_specs().get_writer(dataset=dataset_cldf),
            method='sca'))

    ds = dataset_cldf.cldf_specs().get_dataset()
    res = list(lingpy_util.iter_cognates(ds, method='lexstat'))
    assert res
    lingpy_util.iter_alignments(ds, res)
    assert 'Alignment' in res[0]

    ds = dataset_cldf.cldf_specs().get_dataset()
    res = list(lingpy_util.iter_cognates(ds, method='lexstat'))
    lingpy_util.iter_alignments(
        lingpy_util._cldf2wordlist(ds),
        res,
        almkw=dict(
            ref='lid',
            row='parameter_id',
            transcription='form',
            segments='segments',
            col='language_id'))
    assert 'Alignment' in res[0]

def cldf(dataset, concepticon, **kw):
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}

    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        if 'tokens' not in wl.header:
            wl.add_entries('tokens', 'ipa', lp.ipa2tokens,
                           merge_vowels=False, expand_nasals=True)
        src = getEvoBibAsSource(srckey)
        with CldfDataset((
                'ID', 'Language_ID', 'Language_name', 'Language_iso',
                'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                'Segments', 'Cognacy', 'Loan'),
                dataset, subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(wl[k, 'doculect'],
                                                 wl[k, 'doculect'])
                # negative cognate IDs mark loans
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])
                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                ds.add_row([
                    wid,
                    lang2glot[doculect],
                    wl[k, 'doculect'],
                    '',
                    gloss2con.get(concept, ''),
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens'] or ['']),
                    cogid,
                    wl[k, 'loan'],
                ])
                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey, '', '', ''])
            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates, method='library'))
        for er in sorted(set(errors)):
            print(er, dset)

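# Rough illustration of the add_entries call above: lingpy's ipa2tokens
# segments an IPA string into a list of tokens. The exact segmentation depends
# on lingpy's orthography models, so the output shown is indicative only.
import lingpy as lp

print(lp.ipa2tokens('tʰɔxtər', merge_vowels=False, expand_nasals=True))
# e.g. ['tʰ', 'ɔ', 'x', 't', 'ə', 'r']
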
def cldf(dataset, concepticon, **kw):
    gcode = {x['ID']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {
        x.english: x.concepticon_id
        for x in dataset.conceptlist.concepts.values()
    }
    data = defaultdict(dict)
    for fname in dataset.raw.glob('*.csv'):
        read_csv(fname, data)

    cognatesets = []
    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Parameter_ID',
            'Parameter_name', 'Value', 'Segments'), dataset) as ds:
        for doculect, wl in data.items():
            for concept, (form, loan, cogset) in wl.items():
                wid = '%s-%s' % (slug(doculect), slug(concept))
                if concept in ccode:
                    csid = ccode[concept]
                elif concept.startswith('to ') and concept[3:] in ccode:
                    csid = ccode[concept[3:]]
                else:
                    csid = None
                ds.add_row([
                    wid,
                    gcode[doculect.split('-')[0]],
                    doculect,
                    csid,
                    concept,
                    form,
                    '',
                ])
                if cogset:
                    cognatesets.append([
                        wid,
                        ds.name,
                        form,
                        '%s-%s' % (slug(concept), cogset),
                        False,
                        'expert',
                        '', '', '', '',
                    ])
        segmentize(ds, clean=lambda s: s.split(' ~ ')[0])
        dataset.cognates.extend(
            iter_alignments(ds, cognatesets, column='Segments'))

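# The `clean` callback passed to segmentize above keeps only the first variant
# of forms that list alternatives joined by ' ~ ':
assert 'kap ~ kab'.split(' ~ ')[0] == 'kap'
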
def cldf(dataset, concepticon, **kw):
    orig_ds = Dataset.from_name('baidial')
    orig_ds.commands.cldf(dataset, concepticon, **kw)
    for cldfds in dataset.iter_cldf_datasets():
        for attr in ['dc:isVersionOf', 'dc:provenance']:
            cldfds.table[attr] = dataset.md[attr]
        cldfds.write(outdir=dataset.cldf_dir)

    # Assuming nothing else is needed, we only load the wordlist, align it in
    # lingpy, and create cognates and alignments. There is currently no real
    # source, so we use the placeholder source "List2016i"; ideally the
    # dataset should be published via Zenodo.
    alm = lp.Alignments(dataset.raw.joinpath('BDS-cognates.tsv').as_posix())
    cognates = wordlist2cognates(alm, cldfds, 'List2016i')
    dataset.cognates.extend(iter_alignments(alm, cognates))

def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }

    languages = {}
    words = []
    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)

    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages})

    sources = {}
    for src, langs in groupby(
            sorted(languages.values(), key=lambda r: r[6]), lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(*sources.values())
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
        for i, row in enumerate(words):
            form = row[4]
            if not form or form in '*-':
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            gc = glottolog.get(lang, glottolog.get(languages[lang][7]))
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' \
                else 'e%s' % i
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid,
                gc,
                lang,
                languages[lang][7],
                concepticon[row[1]],
                row[1],
                form,
                segments,
                sources[lang].id,
                None])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]
        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))
    unmapped.pprint()

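# itertools.groupby only merges *adjacent* items, hence the sort by the source
# column (r[6]) above. A toy illustration with hypothetical (name, source)
# rows:
from itertools import groupby

rows = [('LangA', 'Smith 1999'), ('LangB', 'Smith 1999'), ('LangC', 'Doe 2001')]
for src, grp in groupby(sorted(rows, key=lambda r: r[1]), lambda r: r[1]):
    print(src, [name for name, _ in grp])
# Doe 2001 ['LangC']
# Smith 1999 ['LangA', 'LangB']
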
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages
    }

    header, rows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Wordlists.ActualWordlists.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                header = row
            if i > 0:
                rows.append(row)

    cheader, crows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Codings.Multistate.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                cheader = row
            if i > 0:
                crows.append(row)

    langs = header[1:]
    clean_langs = {
        """Gɛ'ɛz""": "Ge'ez",
        "Tigrɛ": "Tigre",
        'ʷalani': "Walani",
        "Ogadɛn Arabic": "Ogaden Arabic",
        "Mɛhri": "Mehri",
        "Gibbali": "Jibbali",
    }
    correct_concepts = {
        'Cold (air)': 'Cold (of air)',
    }
    src = getEvoBibAsSource('Kitchen2012')

    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Parameter_ID',
            'Parameter_name', 'Value', 'Segments'), dataset) as ds:
        D = {0: ['doculect', 'concept', 'ipa', 'tokens']}
        idx = 1
        ds.sources.add(src)
        for row in rows:
            concept = row[0]
            for i, col in enumerate(row[1:]):
                lang = langs[i]
                if col != '---':
                    cleaned_string = clean_string(
                        col,
                        merge_vowels=False,
                        preparse=PREPARSE,
                        rules=CONVERSION,
                        semi_diacritics='')[0]
                    ds.add_row([
                        'Kitchen2012-' + str(idx),
                        language_map[lang],
                        clean_langs.get(lang, lang),
                        concepticon[concept],
                        concept,
                        col,
                        cleaned_string])
                    D[idx] = [
                        clean_langs.get(lang, lang), concept, col,
                        cleaned_string]
                    idx += 1

        wl = lp.Wordlist(D)
        id2cog = {}
        errors = []
        for row in crows:
            taxon = row[0]
            for i, (concept, cog) in enumerate(zip(cheader[1:], row[1:])):
                nconcept = rows[i][0]
                if cog != '-':
                    idxs = wl.get_dict(taxon=taxon)
                    if idxs.get(nconcept, ''):
                        id2cog[idxs[nconcept][0]] = concept + '-' + cog
                    else:
                        errors += [(concept, nconcept, taxon)]

        # words not assigned to any cognate set get singleton IDs
        bad_cogs = 1
        for k in wl:
            if k in id2cog:
                cogid = id2cog[k]
            else:
                cogid = str(bad_cogs)
                bad_cogs += 1
                id2cog[k] = cogid
        wl.add_entries('cog', id2cog, lambda x: x)
        wl.renumber('cog')

        cognates = []
        for k in wl:
            cognates += [[
                'Kitchen2012-' + str(k),
                ds.name,
                wl[k, 'ipa'],
                wl[k, 'concept'] + '-' + str(wl[k, 'cogid']),
                '',
                'expert',
                'Kitchen2012',
                '', '', '']]
        dataset.cognates.extend(iter_alignments(lp.Alignments(wl), cognates))

def cldf(dataset, concepticon, **kw):
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    glotto_map = {c['NAME']: c['GLOTTOCODE'] for c in dataset.languages}

    # retrieve coordinates
    coords = {}
    langs = []
    # language map, as the names are not identical
    language_map = {
        "Namhsan": "Nam Hsan",
        "Pangkham": "Pang Kham",
        "Xiang Zhai Tang (Xiang Cai Tang)": "Xiang Zhai Tang",
    }
    with UnicodeReader(
            dataset.raw.joinpath('100item-phylo.Sheet2.csv')) as reader:
        for i, (num, lat, lon, village, country) in enumerate(reader):
            if i >= 1:
                coords[language_map.get(village, village)] = (lat, lon)
                langs.append(language_map.get(village, village))

    cognates = []
    idx = 1
    with UnicodeReader(dataset.raw.joinpath('100item-phylo.Sheet1.csv'),
                       delimiter=',') as reader,\
            CldfDataset((
                'ID', 'Language_ID', 'Language_name', 'Language_iso',
                'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                'Segments', 'Cognacy'), dataset) as ds:
        ds.sources.add(getEvoBibAsSource('Deepadung2015'))
        ds.metadata['coordinates'] = coords
        data = list(reader)
        header = data[2][2:]
        for i, row in enumerate(data[5:]):
            row = [c.strip() for c in row]
            concept = row[1]
            cid = concept_map[concept]
            for j in range(0, len(header), 2):
                lang = language_map.get(header[j], header[j])
                gcid = glotto_map[lang]
                cog = slug(concept) + '-' + row[2:][j + 1]
                certainty = 0
                # ambiguous codings like '1 or 2' keep the first alternative
                if ' or ' in cog:
                    cog = cog.split(' ')[0]
                    certainty = 1
                word = CORRECT.get(row[2:][j], row[2:][j])
                if word.strip() and ''.join(set(word.strip())) != '-':
                    segments = lp.sequence.sound_classes.clean_string(
                        word,
                        splitters=',',
                        rules=CONVERSION,
                        preparse=PREPARSE,
                        semi_diacritics="")[0]
                    cogid = cog  # already prefixed with slug(concept)
                    ds.add_row([
                        idx, gcid, lang, '', cid, concept, word, PROVIDER,
                        segments, cogid])
                    cognates.append([
                        idx, ds.name, word, cogid, str(certainty), 'expert',
                        PROVIDER, '', '', ''])
                    idx += 1
        dataset.cognates.extend(
            iter_alignments(ds, cognates, method='progressive'))

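# Standalone sketch of the ambiguity handling above: codings like '12 or 13'
# keep only the first alternative and are flagged as uncertain.
cog = 'hand-12 or 13'
certainty = 0
if ' or ' in cog:
    cog, certainty = cog.split(' ')[0], 1
assert (cog, certainty) == ('hand-12', 1)
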
def cldf(dataset, concepticon, **kw):
    concepticon = {
        x.english: x.concepticon_id
        for x in dataset.conceptlist.concepts.values()}
    lmap = {l['ID']: l['GLOTTOCODE'] or None for l in dataset.languages}
    lmap_name = {l['ID']: l['NAME'] or None for l in dataset.languages}

    cognate_sets = defaultdict(list)
    for (cid, c), w, missing in parse(
            dataset.raw.joinpath('galucio-tupi.txt'), lmap):
        assert c in concepticon
        if c in LANGUAGE_ID_FIXES:
            f, t = LANGUAGE_ID_FIXES[c]
            w = re.sub(f + r'\s+', t + ' ', w, count=1)
            missing = re.sub(f + r'\s+', t + ' ', missing, count=1)
        if missing:
            assert re.match(
                r'((?P<lid>%s)\s*\?\s*)+$' % '|'.join(list(lmap.keys())),
                missing)
            missing = missing.replace('?', ' ').split()
        else:
            missing = []
        lids = set(missing[:])
        for m in re.finditer(r'(?P<lid>[A-Z][a-z])\s+', w):
            lids.add(m.group('lid'))
        # make sure all language IDs are valid
        assert not lids.difference(set(lmap.keys()))

        nlids = missing[:]
        for cs in iter_cogsets(w, lmap):
            cognate_sets[(cid, c)].append(cs)
            nlids.extend(list(cs.keys()))
        nlids = set(nlids)
        # make sure we found all expected language IDs
        assert nlids == lids

    cognatesets = []
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_local_ID',
             'Parameter_ID', 'Parameter_name', 'Parameter_local_ID', 'Value',
             'Segments'),
            dataset) as ds:
        for (cid, concept), cogsets in cognate_sets.items():
            for j, cogset in enumerate(cogsets):
                for lid, words in sorted(cogset.items(), key=lambda k: k[0]):
                    for i, word in enumerate(words):
                        wid = '%s-%s-%s-%s' % (lid, cid, j + 1, i + 1)
                        ds.add_row([
                            wid,
                            lmap[lid],
                            lmap_name[lid],
                            lid,
                            concepticon[concept],
                            concept,
                            cid,
                            word,
                            '',
                        ])
                        cognatesets.append([
                            wid,
                            ds.name,
                            word,
                            '%s-%s' % (cid, j + 1),
                            False,
                            'expert',
                            '', '', '', '',
                        ])
        segmentize(ds, clean=lambda s: s.split(' ~ ')[0])
        dataset.cognates.extend(
            iter_alignments(ds, cognatesets, column='Segments'))

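# Sketch of the 'missing' format validated by the assert above: a sequence of
# known language IDs, each followed by a question mark (the IDs 'Ar' and 'Ka'
# below are hypothetical):
import re

assert re.match(r'((?P<lid>Ar|Ka)\s*\?\s*)+$', 'Ar ? Ka ?')
assert not re.match(r'((?P<lid>Ar|Ka)\s*\?\s*)+$', 'Xx ?')
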
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages
    }
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    # 'year' is missing from the conceptlist mapping (unclear why, given the
    # 200-item list), so it is added manually
    concept_map['year'] = '1226'

    wordlists = list(read_csv(dataset))
    cogsets = defaultdict(lambda: defaultdict(list))
    for wl in wordlists:
        for concept, (words, cogids) in wl.words.items():
            if len(cogids) == 1:
                cogsets[concept][cogids[0]].append(words[0])

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        cognates = []
        for wl in wordlists:
            for concept, (words, cogids) in wl.words.items():
                if len(cogids) > 1:
                    if len(words) < len(cogids):
                        if len(words) == 1:
                            if ':' in words[0]:
                                words = words[0].split(':')
                            if ',' in words[0]:
                                words = words[0].split(',')
                        assert len(words) >= len(cogids)
                        assert (wl.language, concept) in COGSET_MAP
                    if len(words) > len(cogids):
                        assert (wl.language, concept) in COGSET_MAP
                if (wl.language, concept) in COGSET_MAP:
                    word_to_cogid = COGSET_MAP[(wl.language, concept)]
                else:
                    word_to_cogid = dict(izip_longest(words, cogids))
                for i, word in enumerate(words):
                    if word.startswith('(') and word.endswith(')'):
                        word = word[1:-1].strip()
                    wid = '%s-%s-%s' % (
                        slug(wl.language), slug(concept), i + 1)
                    ds.add_row([
                        wid,
                        '',
                        wl.language,
                        concept_map.get(concept, ''),
                        concept,
                        word,
                        clean_string(word, splitters='?')[0],
                        SOURCE,
                        '',
                    ])
                    if word_to_cogid.get(word):
                        cognates.append([
                            wid,
                            ds.name,
                            word,
                            '%s-%s' % (slug(concept), word_to_cogid[word]),
                            False,
                            'expert',
                            SOURCE,
                            '', '', '',
                        ])
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments'))

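# Sketch of the izip_longest fallback above (izip_longest is the Python 2
# name; on Python 3 it is itertools.zip_longest): words without a matching
# cognate ID map to None and are then skipped by `word_to_cogid.get(word)`.
from itertools import zip_longest

assert dict(zip_longest(['kap', 'kab'], ['1'])) == {'kap': '1', 'kab': None}
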
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary \
    files as follows:
        - the JSON object from `metadata.json` is available as `dataset.md`
        - items from languages.csv are available as `dataset.languages`
        - items from concepts.csv are available as `dataset.concepts`
        - if a Concepticon conceptlist was specified in metadata.json, its \
        ID is available as `dataset.conceptlist`
    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: All arguments passed on the command line.
    """
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())

    # get language identifiers
    lids, cids, coords = {}, {}, {}
    for row in dataset.languages:
        language = row['NAME']
        lids[language] = row['GLOTTOCODE']
    coords = dict([wl.coords[taxon] for taxon in lids])

    modify = {
        'thunder (verb)': 'thunder',
        'flash (verb)': 'lightning',
        'room': 'flat',
        'have diarrea': 'have diarrhoea',
        'watery': 'light'
    }
    for row in dataset.concepts:
        concept = modify[row['CONCEPT']] if row['CONCEPT'] in modify \
            else row['CONCEPT']
        cids[concept] = row['CONCEPT_SET']

    # sources
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2014b')

    # get partial identifiers
    partial_ids = defaultdict(list)
    partial_converter = {}
    idx = 1
    for k in wl:
        for char in wl[k, 'counterpart']:
            if char in partial_converter:
                pidx = partial_converter[char]
            else:
                pidx = idx
                partial_converter[char] = idx
                idx += 1
            partial_ids[k] += [pidx]

    # trace whether the proto-language was already visited
    visited = []
    idx = max([k for k in wl]) + 1
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Parameter_Chinese_name',
             'Value', 'Value_Chinese_characters', 'Source', 'Segments',
             'Cognacy', 'Rank', 'Comment'), dataset) as ds:
        ds.sources.add(src)
        ds.sources.add(src2)
        D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']}
        for k in wl:
            tokens = lp.ipa2tokens(wl[k, 'ipa'], merge_vowels=False,
                                   expand_nasals=True)
            # remove sandhi annotation in tokens, as it confuses clpa
            for i, t in enumerate(tokens):
                if '⁻' in t:
                    tokens[i] = t[:t.index('⁻')]
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                lids[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                cids[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'mandarin'],
                wl[k, 'ipa'],
                wl[k, 'counterpart'],
                SOURCE,
                ' '.join(tokens),
                wl[k, 'cogid'],
                wl[k, 'order'],
                wl[k, 'note'] if wl[k, 'note'] != '-' else '',
            ])
            D[k] = [
                wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'], tokens,
                wl[k, 'cogid']]
            if wl[k, 'cogid'] not in visited:
                # we need to add missing tones, otherwise it won't work: we
                # split syllables first, then check whether each syllable
                # ends in a tone mark and add one if it does not
                syllables = wl[k, 'mch'].split('.')
                for i, s in enumerate(syllables):
                    if s[-1] not in '²³':
                        if s[-1] not in 'ptk':
                            syllables[i] += '¹'
                        else:
                            syllables[i] += '⁴'
                tokens = lp.ipa2tokens(''.join(syllables))
                ds.add_row([
                    '{0}-{1}'.format(wl[k, 'concept'], idx),
                    'sini1245',
                    'Middle Chinese',
                    '',
                    cids[wl[k, 'concept']],
                    wl[k, 'concept'],
                    '',
                    wl[k, 'proto'],
                    wl[k, 'counterpart'],
                    SOURCE,
                    ' '.join(tokens),
                    wl[k, 'cogid'],
                    '',
                    ''])
                D[idx] = [
                    'Middle Chinese', wl[k, 'concept'], wl[k, 'mch'], tokens,
                    wl[k, 'cogid']]
                idx += 1
                visited += [wl[k, 'cogid']]
        alms = lp.Alignments(D)
        cognates = [[
            '{0}-{1}'.format(SOURCE, k),
            ds.name,
            alms[k, 'ipa'],
            '-'.join([slug(alms[k, 'concept']), str(alms[k, 'cogid'])]),
            '',
            'expert',
            SOURCE,
            '', '', ''] for k in alms]
        dataset.cognates.extend(
            iter_alignments(alms, cognates, method='library'))

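# Standalone sketch of the tone-completion rule used above for Middle Chinese:
# syllables without an explicit tone mark receive '¹', unless they end in a
# stop (p/t/k), which marks the entering tone '⁴' (the function name is ours,
# for illustration only).
def complete_tones(mch):
    syllables = mch.split('.')
    for i, s in enumerate(syllables):
        if s[-1] not in '²³':
            syllables[i] += '⁴' if s[-1] in 'ptk' else '¹'
    return '.'.join(syllables)

assert complete_tones('pak.la') == 'pak⁴.la¹'
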
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages
    }
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }

    wordsh, words = read_csv(dataset, 'supplementary.Sheet1.csv', 0)
    cognatesh, cognates = read_csv(dataset, 'Japonic_recovered.Sheet1.csv', 1)

    def concepts(h, step):
        l = h[2:]
        return {i + 2: l[i] for i in range(0, len(l), step)}

    word_index_to_concept = concepts(wordsh, 1)
    assert all(c in concept_map for c in word_index_to_concept.values())
    assert len(words) == len(cognates)

    def sorted_(l):
        return sorted(l, key=lambda r: r[:2])

    cognatesets = []
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'AltTranscription',
    ), dataset) as ds:
        for i, (word, cognate) in enumerate(
                zip(sorted_(words), sorted_(cognates))):
            if not word[1]:
                continue
            if word[1] == 'Nigata':
                word[1] = 'Niigata'
            assert word[:2] == cognate[:2]
            lname = word[1]
            lid = slug(lname)
            for index, concept in word_index_to_concept.items():
                if word[index] == '?':
                    continue
                wid = '%s-%s' % (lid, index - 1)
                cindex = (index - 1) * 2
                assert cognatesh[cindex] == concept
                ds.add_row([
                    wid,
                    language_map[lname],
                    lname,
                    concept_map[concept],
                    concept,
                    word[index],
                    '',
                    cognate[cindex],
                ])
                cs = cognate[cindex + 1]
                for css in cs.split('&'):
                    css = css.strip()
                    if css != '?':
                        css = int(float(css))
                    cognatesets.append([
                        wid,
                        ds.name,
                        word[index],
                        '%s-%s' % (index - 1, css),
                        False,
                        'expert',
                        '', '', '', '',
                    ])
        segmentize(ds)
        dataset.cognates.extend(
            iter_alignments(ds, cognatesets, column='Segments'))

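# Sketch of the cognate-cell parsing above: a cell may hold several codes
# joined by '&', each possibly written as a float, with '?' marking unknown
# membership.
for css in '1.0 & 2.0 & ?'.split('&'):
    css = css.strip()
    if css != '?':
        css = int(float(css))
    print(repr(css))
# 1, 2, '?'
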