def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()}
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages}

    with UnicodeReader(
            dataset.raw.joinpath(FILENAME.replace('xls', 'Sheet1.csv'))) as r:
        rows = [row for row in r]

    # Every second column of the header row holds a concept label.
    concepts = [
        (i, rows[0][i].replace('_', ' ').strip())
        for i in range(1, len(rows[0]), 2)]
    assert all(concept in concepticon for _, concept in concepts)

    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
             'Parameter_name', 'Value', 'Segments', 'Source'),
            dataset) as ds:
        ds.table.schema.columns['Value']['dc:format'] = 'IPA'
        ds.sources.add(getEvoBibAsSource(SOURCE))

        for row in rows[3:]:
            row = [col.strip() for col in row]
            if not row[0]:
                continue
            lname = row[0]
            for i, concept in concepts:
                for j, form in iterforms(row[i]):
                    if form != '?' and form.strip():
                        ds.add_row([
                            '%s-%s-%s' % (slug(lname), (i + 1) // 2, j),
                            language_map[lname],
                            lname.replace('_', ' '),
                            concepticon[concept],
                            concept,
                            form,
                            ' '.join(clean_string(form)),
                            SOURCE])

        # Three cognate-detection methods are available: turchin, sca and
        # lexstat; turchin is fast and does not require a threshold.
        cognates = iter_cognates(
            ds, column='Segments', method='turchin', threshold=0.55)
        # Two alignment methods are available: progressive or library.
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments', method='progressive'))
        dataset.write_cognates()
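# A hedged sketch of the alternative cognate-detection calls mentioned in the
# comment above; the method names come from that comment, while the threshold
# values below are illustrative assumptions, not settings used in this script:
#
#     cognates = iter_cognates(ds, column='Segments', method='sca', threshold=0.45)
#     cognates = iter_cognates(ds, column='Segments', method='lexstat', threshold=0.6)
#
# and likewise the library-based alignment variant:
#
#     dataset.cognates.extend(
#         iter_alignments(ds, cognates, column='Segments', method='library'))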
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    src = getEvoBibAsSource(SOURCE)

    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Parameter_Chinese_name',
             'Value', 'Segments', 'Source'),
            dataset) as ds:
        ds.sources.add(src)
        for k in wl:
            if wl[k, 'value'] not in '---' and wl[k, 'value'].strip():
                ds.add_row([
                    wl[k, 'lid'],
                    gcode[wl[k, 'doculect']],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'chinese'],
                    wl[k, 'value'],
                    clean_string(wl[k, 'value'])[0],
                    SOURCE])
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()}
    # Map concept labels as used in the raw data to labels of the concept list.
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }

    languages = {}
    words = []
    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)

    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages})

    # Group languages by the source given in column 6 and create one source
    # entry per group.
    sources = {}
    for src, langs in groupby(
            sorted(languages.values(), key=lambda r: r[6]), lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Value', 'Segments', 'Source',
             'Comment'),
            dataset) as ds:
        ds.sources.add(*sources.values())
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
        for i, row in enumerate(words):
            form = row[4]
            if not form or form in '*-':
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            # Look up the Glottocode by language name, falling back to the ISO
            # code given in the language table.
            gc = glottolog.get(lang, glottolog.get(languages[lang][7]))
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' else (
                'e%s' % i)
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid,
                gc,
                lang,
                languages[lang][7],
                concepticon[row[1]],
                row[1],
                form,
                segments,
                sources[lang].id,
                None])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]

        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))
    unmapped.pprint()
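# For reference, the in-memory wordlist passed to lp.Wordlist above follows
# lingpy's convention of a dict with the header row under key 0 and one record
# per numeric key; the record values here are illustrative, not taken from the
# actual data:
#
#     D = {
#         0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog'],
#         1: ['ds-1', 'Khmer', 'hand', 'daj', 'd a j', 'hand-1'],
#     }
#     wl = lp.Wordlist(D)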
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()}
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages}

    # Read the word lists.
    header, rows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Wordlists.ActualWordlists.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                header = row
            if i > 0:
                rows.append(row)

    # Read the multistate cognate codings.
    cheader, crows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Codings.Multistate.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                cheader = row
            if i > 0:
                crows.append(row)

    langs = header[1:]
    clean_langs = {
        """Gɛ'ɛz""": "Ge'ez",
        "Tigrɛ": "Tigre",
        'ʷalani': "Walani",
        "Ogadɛn Arabic": "Ogaden Arabic",
        "Mɛhri": "Mehri",
        "Gibbali": "Jibbali",
    }
    correct_concepts = {
        'Cold (air)': 'Cold (of air)',
    }
    src = getEvoBibAsSource('Kitchen2012')

    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
             'Parameter_name', 'Value', 'Segments'),
            dataset) as ds:
        D = {0: ['doculect', 'concept', 'ipa', 'tokens']}
        idx = 1
        ds.sources.add(src)
        for row in rows:
            concept = row[0]
            for i, col in enumerate(row[1:]):
                lang = langs[i]
                if col != '---':
                    cleaned_string = clean_string(
                        col, merge_vowels=False, preparse=PREPARSE,
                        rules=CONVERSION, semi_diacritics='')[0]
                    ds.add_row([
                        'Kitchen2012-' + str(idx),
                        language_map[lang],
                        clean_langs.get(lang, lang),
                        concepticon[concept],
                        concept,
                        col,
                        cleaned_string])
                    D[idx] = [
                        clean_langs.get(lang, lang), concept, col,
                        cleaned_string]
                    idx += 1

        wl = lp.Wordlist(D)

        # Assign expert cognate classes from the multistate codings.
        id2cog = {}
        errors = []
        for row in crows:
            taxon = row[0]
            for i, (concept, cog) in enumerate(zip(cheader[1:], row[1:])):
                nconcept = rows[i][0]
                if cog != '-':
                    idxs = wl.get_dict(taxon=taxon)
                    if idxs.get(nconcept, ''):
                        id2cog[idxs[nconcept][0]] = concept + '-' + cog
                    else:
                        errors += [(concept, nconcept, taxon)]

        # Entries without an expert coding get a fresh singleton cognate id.
        bad_cogs = 1
        cognates = []
        for k in wl:
            if k in id2cog:
                cogid = id2cog[k]
            else:
                cogid = str(bad_cogs)
                bad_cogs += 1
                id2cog[k] = cogid

        wl.add_entries('cog', id2cog, lambda x: x)
        wl.renumber('cog')
        for k in wl:
            cognates += [[
                'Kitchen2012-' + str(k),
                ds.name,
                wl[k, 'ipa'],
                wl[k, 'concept'] + '-' + str(wl[k, 'cogid']),
                '',
                'expert',
                'Kitchen2012',
                '',
                '',
                '']]

        dataset.cognates.extend(iter_alignments(lp.Alignments(wl), cognates))
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages}
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()}
    # 'year' is missing from the mapping even though the list has 200 items;
    # add it manually.
    concept_map['year'] = '1226'

    wordlists = list(read_csv(dataset))

    cogsets = defaultdict(lambda: defaultdict(list))
    for wl in wordlists:
        for concept, (words, cogids) in wl.words.items():
            if len(cogids) == 1:
                cogsets[concept][cogids[0]].append(words[0])

    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
             'Parameter_name', 'Value', 'Segments', 'Source', 'Comment'),
            dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        cognates = []
        for wl in wordlists:
            for concept, (words, cogids) in wl.words.items():
                if len(cogids) > 1:
                    if len(words) < len(cogids):
                        # A single cell may hold several words separated by
                        # ':' or ','.
                        if len(words) == 1:
                            if ':' in words[0]:
                                words = words[0].split(':')
                            if ',' in words[0]:
                                words = words[0].split(',')
                            assert len(words) >= len(cogids)
                        assert (wl.language, concept) in COGSET_MAP
                    if len(words) > len(cogids):
                        assert (wl.language, concept) in COGSET_MAP

                if (wl.language, concept) in COGSET_MAP:
                    word_to_cogid = COGSET_MAP[(wl.language, concept)]
                else:
                    word_to_cogid = dict(izip_longest(words, cogids))

                for i, word in enumerate(words):
                    if word.startswith('(') and word.endswith(')'):
                        word = word[1:-1].strip()
                    wid = '%s-%s-%s' % (
                        slug(wl.language), slug(concept), i + 1)
                    ds.add_row([
                        wid,
                        '',
                        wl.language,
                        concept_map.get(concept, ''),
                        concept,
                        word,
                        clean_string(word, splitters='?')[0],
                        SOURCE,
                        '',
                    ])
                    if word_to_cogid.get(word):
                        cognates.append([
                            wid,
                            ds.name,
                            word,
                            '%s-%s' % (slug(concept), word_to_cogid[word]),
                            False,
                            'expert',
                            SOURCE,
                            '',
                            '',
                            '',
                        ])

        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments'))
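# COGSET_MAP, defined elsewhere in this module, maps (language, concept) pairs
# to an explicit word-to-cognate-set mapping for cells where words and cognate
# ids cannot be paired automatically. A purely hypothetical entry, only to
# illustrate the shape the code above expects, might look like:
#
#     COGSET_MAP = {
#         ('SomeLanguage', 'hand'): {'daj': '1', 'tee': '2'},
#     }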
def clean_string_with_validation(string):
    try:
        return ' '.join(clean_string(string))
    except IndexError:
        return None
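# Illustrative usage (assumed behaviour: clean_string returns a list of
# segmented tokens and may raise IndexError on input it cannot parse, which
# this wrapper turns into None):
#
#     clean_string_with_validation('kamu')   # e.g. 'k a m u'
#     clean_string_with_validation('')       # None if clean_string raises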