def cmd_makecldf(self, args):
    """Build the CLDF dataset from the raw NorthEuralex export."""
    # Bibliographic sources and the language list go in unmodified.
    args.writer.add_sources()
    args.writer.add_languages()

    # Register concepts, recording NorthEuralex gloss -> concept ID.
    nelex_to_cid = {}
    for concept in self.conceptlists[0].concepts.values():
        cid = "{0}_{1}".format(concept.id.split("-")[-1], slug(concept.english))
        args.writer.add_concept(
            ID=cid,
            Name=concept.english,
            NorthEuralex_Gloss=concept.attributes["nelex_id"],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        nelex_to_cid[concept.attributes["nelex_id"]] = cid

    # One form per row of the raw TSV export; spaces in the IPA string
    # become '_' in the Form column.
    lexeme_rows = self.raw_dir.read_csv("nelex.tsv", delimiter="\t", dicts=True)
    for row in pylexibank.progressbar(lexeme_rows):
        args.writer.add_form(
            Language_ID=row["Language_ID"],
            Parameter_ID=nelex_to_cid[row["Concept_ID"]],
            Value=row["Word_Form"],
            Form=row["rawIPA"].strip().replace(" ", "_"),
            Source=["Dellert2020"],
        )
def cmd_makecldf(self, args):
    """Convert the raw spreadsheet (one column per language) into CLDF."""
    args.writer.add_sources()
    # Map language names (the raw file's column headers) to language IDs.
    languages = args.writer.add_languages(lookup_factory="Name")

    concept_ids = {}
    for concept in self.conceptlists[0].concepts.values():
        cid = "{0}_{1}".format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=cid,
            Category=concept.attributes['category'],
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss)
        concept_ids[concept.english] = cid

    for row in pylexibank.progressbar(self.raw_dir.read_csv('raw.csv', dicts=True)):
        for column, lexeme in row.items():
            # Only columns naming a known language carry form data.
            if column not in languages:
                continue
            args.writer.add_forms_from_value(
                Language_ID=languages[column],
                Parameter_ID=concept_ids[row["English"]],
                Value=lexeme,
                Category=row["Category"],
                Source="Sawka2019",
            )
def cmd_makecldf(self, args):
    """Derive the CLDF dataset from the lingpy wordlist in raw/."""
    args.writer.add_sources()
    wl = lingpy.Wordlist(self.dir.joinpath("raw", "wordlist.tsv").as_posix())

    def normalize_chinese(gloss):
        # Chinese glosses are matched after dropping spaces and asterisks.
        return gloss.replace(" ", "").replace("*", "")

    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        chinese = normalize_chinese(concept.attributes["chinese"])
        args.writer.add_concept(
            ID=concept.id,
            Name=concept.english,
            Chinese_Gloss=chinese,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[chinese] = concept.id

    # Doculects in the wordlist use Chinese names; map them to language IDs.
    langs = {row["ChineseName"]: row["ID"] for row in self.languages}
    args.writer.add_languages()

    for idx in pylexibank.progressbar(wl, desc="cldfify"):
        args.writer.add_form_with_segments(
            Language_ID=langs[wl[idx, "doculect"]],
            Parameter_ID=concepts[normalize_chinese(wl[idx, "concept"])],
            Value=wl[idx, "value"],
            Form=wl[idx, "form"],
            Segments=wl[idx, "tokens"],
            Source=["Castro2010a"],
        )
def cmd_makecldf(self, args):
    """Create the CLDF dataset from the raw Yi wordlist."""
    wl = lingpy.Wordlist(self.raw_dir.joinpath("yi-wl.tsv").as_posix())
    args.writer.add_sources()
    languages = args.writer.add_languages(lookup_factory="Name")

    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        cid = "{0}_{1}".format(concept.id.split("-")[-1], slug(concept.english))
        args.writer.add_concept(
            ID=cid,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            Chinese_Gloss=concept.attributes["chinese"],
        )
        concepts[concept.english] = cid
    # The raw data capitalizes this one gloss; alias it to the same concept.
    concepts["Daughter-in-law"] = concepts["daughter-in-law"]

    for idx in pylexibank.progressbar(wl, desc="cldfify", total=len(wl)):
        args.writer.add_form_with_segments(
            Language_ID=languages[wl[idx, "doculect"]],
            Parameter_ID=concepts[wl[idx, "concept"]],
            Value=wl[idx, "value"],
            Form=wl[idx, "form"],
            Segments=wl[idx, "tokens"],
            Source=["Castro2010"],
        )
def cmd_makecldf(self, args):
    """Build CLDF from the tab-separated ZMYYC raw file."""
    args.writer.add_sources()
    language_lookup = args.writer.add_languages(lookup_factory="Name")

    concept_lookup = {}
    for concept in self.conceptlists[0].concepts.values():
        cid = "{0}_{1}".format(concept.id.split("-")[-1], slug(concept.english))
        # Raw entries reference concepts either by gloss or by list number.
        concept_lookup[concept.english] = cid
        concept_lookup[concept.number] = cid
        args.writer.add_concept(
            ID=cid,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            Name=concept.english,
            Chinese_Gloss=concept.attributes["chinese"],
        )

    rows = self.raw_dir.read_csv("ZMYYC.csv", delimiter="\t", dicts=True)
    for entry in pylexibank.progressbar(rows):
        args.writer.add_forms_from_value(
            Language_ID=language_lookup[entry["language"]],
            # srcid may carry a sub-number after '.', dropped for the lookup.
            Parameter_ID=concept_lookup.get(entry["srcid"].split(".")[0]),
            Local_ID=entry["rn"],
            Value=entry["reflex"],
            Source=["Sun1991"],
        )
def cmd_makecldf(self, args):
    """Build the CLDF dataset: languages, normalized segments, forms, cognates."""
    args.writer.add_sources()
    # Per-language source keys (lower-cased, comma-separated in the language
    # table) used for the Source column of each form.
    sources = {}
    for language in self.languages:
        sources[language["ID"]] = [
            x.lower() for x in language["Source"].split(",")
        ]
        args.writer.add_language(**language)
    # Transcription clean-up table: maps idiosyncratic symbols of the raw
    # data onto the target segment inventory ('_' is a morpheme boundary
    # and becomes '+').
    segments = {
        "ž": "ʒ",
        "nˈ": "nʲ",
        "lˈ": "lʲ",
        "gˈ": "gʲ",
        "rˈ": "rʲ",
        "pˈ": "pʲ",
        "s-": "s",
        "š": "ʃ",
        "βˈ": "βʲ",
        "sˈ": "sʲ",
        "tʃ": "tɕ",
        "ʦ": "ts",
        "_": "+",
        "ch": "x",
    }
    concepts = args.writer.add_concepts(
        id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
        lookup_factory="Name")
    for row in progressbar(
            self.raw_dir.read_csv(DATAFILE, delimiter="\t", dicts=True)):
        if row["ID"].startswith("#"):
            # skip lingpy stuff
            continue
        # patch two weird/broken entries:
        if row["ID"] == "7560":
            row["TOKENS"] = "ɕ i v ɘ ʨ"
        if row["ID"] == "8367":
            row["ENTRY"] = "ʒɯl"
        segs = [segments.get(x, x) for x in row["TOKENS"].split()]
        lex = args.writer.add_form_with_segments(
            Local_ID=row["ID"],
            Language_ID=row["DOCULECT"],
            Parameter_ID=concepts.get(row["CONCEPT"]),
            Value=row["ENTRY"],
            # sometimes the FORM value is empty for some reason.
            # if so we use the parsed 'segments' field by removing spaces.
            Form=row["FORM"] if row["FORM"] else "".join(segs),
            Segments=segs,
            Source=sources.get(row["DOCULECT"]) or [""],
        )
        # NOTE(review): iterating row["ALIGNMENT"] walks it character by
        # character, so multi-character keys in `segments` (e.g. "nˈ")
        # can never match here — confirm ALIGNMENT is single-character
        # tokens or whether `.split()` was intended.
        args.writer.add_cognate(
            lexeme=lex,
            Cognateset_ID=row["COGID"],
            Alignment=[segments.get(x, x) for x in row["ALIGNMENT"]],
            Root=row["ROOT"],
        )
def cmd_makecldf(self, args):
    """Convert raw.csv to CLDF; languages appear as columns of the raw table."""
    rows = self.raw_dir.read_csv("raw.csv", dicts=True)
    languages = args.writer.add_languages(lookup_factory="Name")
    args.writer.add_sources()
    concepts = args.writer.add_concepts(
        id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
        lookup_factory="Name"
    )

    for row in pylexibank.progressbar(rows):
        for column, lexeme in row.items():
            # Non-language columns (glosses etc.) carry no form data.
            if column not in languages:
                continue
            args.writer.add_forms_from_value(
                Language_ID=languages[column],
                Parameter_ID=concepts[row["English gloss"]],
                Value=lexeme,
                AmharicGloss=row["Amharic gloss"],
                Source="Bremer2016",
            )

    # We explicitly remove the ISO column since none of the languages in
    # this dataset have an ISO code.
    schema = args.writer.cldf["LanguageTable"].tableSchema
    schema.columns = [c for c in schema.columns if c.name != "ISO639P3code"]
def cmd_makecldf(self, args):
    """Build CLDF forms and cognates from the analyzed lingpy wordlist."""
    args.writer.add_sources()

    # Concepts come from the dataset's own concept list.
    concept_ids = {}
    for row in self.concepts:
        cid = '{0}_{1}'.format(row['NUMBER'], slug(row['ENGLISH']))
        args.writer.add_concept(
            ID=cid,
            Number=row['NUMBER'],
            Name=row['ENGLISH'],
            Concepticon_ID=row["CONCEPTICON_ID"],
            Concepticon_Gloss=row["CONCEPTICON_GLOSS"])
        concept_ids[row["ENGLISH"]] = cid

    args.writer.add_languages()
    wl = Wordlist(
        self.raw_dir.joinpath('lundgren_ma_analyzed_data.tsv').as_posix())
    for idx in progressbar(wl):
        # Fall back to the joined tokens when no plain IPA value is given.
        spelling = wl[idx, 'ipa'] or ''.join(wl[idx, 'tokens'])
        lexeme = args.writer.add_form_with_segments(
            Local_ID=idx,
            Language_ID=wl[idx, 'doculect'],
            Parameter_ID=concept_ids[wl[idx, 'concept']],
            Value=spelling,
            Form=spelling,
            # lingpy marks morpheme boundaries with '_'; CLDF uses '+'.
            Segments=['+' if token == '_' else token
                      for token in wl[idx, 'tokens']],
            Source=['Lundgren2020'])
        args.writer.add_cognate(
            lexeme=lexeme,
            Cognateset_ID=wl[idx, 'cogid'],
            Alignment=wl[idx, 'alignment'],
            Source=['Lundgren2020'])
def cmd_makecldf(self, args):
    """Convert raw.csv to CLDF; concepts are looked up via Chinese glosses.

    Fixes relative to the previous version:

    * ``missing`` was a plain dict incremented with ``+=`` on unseen keys,
      which raised ``KeyError`` on the first unmatched gloss; counting now
      uses ``dict.get`` with a default of 0.
    * ``add_languages()`` was called twice (once without a lookup, once
      with), which wrote every language into the LanguageTable twice.
    """
    data = self.raw_dir.read_csv('raw.csv', dicts=True)
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split('-')[-1] + '_' + slug(concept.gloss)
        args.writer.add_concept(
            ID=idx,
            Name=concept.gloss,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            Chinese_Gloss=concept.attributes['chinese'])
        # Raw rows reference concepts by their Chinese gloss.
        concepts[concept.attributes['chinese']] = idx
    languages = args.writer.add_languages(lookup_factory='Name')
    args.writer.add_sources()

    # Count raw glosses that have no matching concept.
    missing = {}
    for cgloss, entry in progressbar(enumerate(data),
                                     desc='cldfify the data',
                                     total=len(data)):
        if entry['Chinese gloss'] in concepts:
            for language in languages:
                if entry[language].strip():
                    args.writer.add_lexemes(
                        Language_ID=languages[language],
                        Parameter_ID=concepts[entry['Chinese gloss']],
                        Value=entry[language],
                        Source=['Chen2012'])
        else:
            missing[entry["Chinese gloss"]] = \
                missing.get(entry["Chinese gloss"], 0) + 1
def cmd_makecldf(self, args):
    """Build the single-language Võro IDS dataset."""
    glottocode = "voro1241"
    transcriptions = ["StandardOrth"]
    # Concept IDs come straight from the IDS identifiers.
    args.writer.add_concepts(id_factory=lambda c: c.attributes['ids_id'])
    args.writer.add_sources(*self.raw_dir.read_bib())
    personnel = self.get_personnel(args)
    args.writer.add_language(
        ID=glottocode,
        Name="Võro",
        Glottocode=glottocode,
        Authors=personnel['author'],
        DataEntry=personnel['data entry'],
        Consultants=personnel['consultant'],
        Representations=transcriptions,
        Latitude=58.0,
        Longitude=26.6,
        date='2020-09-17',
    )
    for form in pylexibank.progressbar(
            self.read_csv("ids_voro1241.idsclldorg.csv")):
        # Rows without a form carry no lexical data.
        if not form.form:
            continue
        args.writer.add_lexemes(
            Language_ID=glottocode,
            Parameter_ID=form.ids_id,
            Value=form.form,
            Comment=form.comment,
            Source="cosgrove2020",
            Transcriptions=transcriptions,
        )
    self.apply_cldf_defaults(args)
def cmd_makecldf(self, args):
    """Build CLDF: raw columns are concept glosses, raw rows are languages."""
    concept_ids = {}
    for concept in self.conceptlists[0].concepts.values():
        cid = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=cid,
            Name=concept.english,
            Chinese_Gloss=concept.attributes['chinese'],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss
        )
        # Headers may be '<chinese> <english>' or plain English.
        concept_ids[concept.attributes['chinese'] + ' ' + concept.english] = cid
        concept_ids[concept.english] = cid
    args.writer.add_languages()
    args.writer.add_sources()

    rows = self.raw_dir.read_csv('data.tsv', delimiter='\t', dicts=True)[1:]
    for row, language in progressbar(zip(rows, self.languages)):
        for gloss, entry in list(row.items())[1:]:
            if not entry.strip():
                continue
            # Try the full header, then the header minus its first word,
            # falling back to the sentinel '?'.
            fallback = concept_ids.get(' '.join(gloss.split(' ')[1:]), '?')
            args.writer.add_form(
                Language_ID=language['ID'],
                Parameter_ID=concept_ids.get(gloss, fallback),
                Value=entry,
                Form=entry.replace(" ", "_"),
                Source=[language['Source']]
            )
def cmd_makecldf(self, args):
    """
    Convert the raw data to a CLDF dataset.

    Fixes relative to the previous version:

    * the ``else`` branch recorded ``cogid``, a variable only assigned in
      the ``if`` branch — the first form without a cognate annotation
      raised ``NameError`` (or silently reused a stale value from an
      earlier row); the concept ID is recorded instead;
    * ``missingL``/``missingC`` stored the ``None`` returned by the failed
      lookups, so the reports printed ``missing L None``; the raw names
      from the data row are recorded now;
    * removed the unused ``cogids`` dict.
    """
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        cid = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=cid,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            Number=concept.number
        )
        concepts[concept.number] = cid
    args.log.info('[i] added concepts')
    languages = args.writer.add_languages(lookup_factory="Number")
    args.log.info('[i] added languages')
    args.writer.add_sources()

    missingL, missingC = set(), set()
    missingCog = set()
    for row in progressbar(
            self.raw_dir.read_csv('data.tsv', delimiter='\t', dicts=True)):
        lid = languages.get(row['LANGUAGE'])
        cid = concepts.get(row['SID'])
        # take only the first cognate ID if there are several
        cog = row['COGNATE'].split('|')[0]
        if lid and cid and row["FORM"] and row["FORM"].strip():
            lexemes = args.writer.add_forms_from_value(
                Language_ID=lid,
                Parameter_ID=cid,
                Value=row["FORM"],
                Source='Sun1991'
            )
            if cog.strip():
                args.writer.add_cognate(
                    lexeme=lexemes[0],
                    Cognateset_ID=cid + '-' + cog,
                    Cognate_Detection_Method='expert',
                    Source='Gao2020'
                )
            else:
                # concept whose row lacks a cognate annotation
                missingCog.add(cid)
        if not lid:
            missingL.add(row['LANGUAGE'])
        if not cid:
            missingC.add(row['SID'])
    for entry in missingL:
        print('missing L {0}'.format(entry))
    for entry in missingC:
        print('missing C {0}'.format(entry))
    for entry in missingCog:
        print('missing Cognate {0}'.format(entry))
def cmd_makecldf(self, args):
    """Convert the Suansu wordlist into CLDF.

    Fix: the concept cache ``concepts`` is keyed by the concept gloss, but
    membership was tested against the Concepticon ID
    (``wl[k, "concepticon_id"] not in concepts``).  Since IDs never equal
    glosses, the test always succeeded and ``add_concept`` was called for
    every single row, duplicating ParameterTable entries.  The test now
    uses the gloss, matching how the cache is filled and read.
    """
    wl = lingpy.Wordlist(self.raw_dir.joinpath("suansu.tsv").as_posix())
    # Normalization of the source transcription to the target segments.
    converter = {
        "ll": "lː",
        "ddʑ": "dʑː",
        "mm": "mː",
        "nn": "nː",
        "ss": "sː",
        "tts": "tsː",
        "tʂ": "ʈʂː",
        "bb": "bː",
        "dd": "dː",
        "pp": "pː",
        "tt": "tː",
        "ttʰ": "tʰː",
        "ɹɹ": "ɹː",
        "ff": "fː",
        "je": "j e",
        "oj": "oi",
        "ph": "pʰ",
        "th": "tʰ",
        "ttɕ": "tɕː",
        "ttʃ": "tʃː",
        "ma": "m a",
        "ē": "e",
        "ê": "e",
        "ʈʈʂ": "ʈʂː",
        "I": "ɪ",
        "ʷ": "w",
    }
    args.writer.add_sources()
    concepts = {}
    args.writer.add_languages()
    for k in progressbar(wl, desc="wl-to-cldf", total=len(wl)):
        # Register each concept only once, on first encounter.
        if wl[k, "concept"] not in concepts:
            cid = "{0}_{1}".format(wl[k, "concepticon_id"],
                                   slug(wl[k, "concept"]))
            concepts[wl[k, "concept"]] = cid
            args.writer.add_concept(
                ID=cid,
                Name=wl[k, "concept"],
                Concepticon_ID=wl[k, "concepticon_id"],
                Concepticon_Gloss=wl[k, "concepticon_gloss"],
            )
        args.writer.add_form_with_segments(
            Language_ID="Suansu",
            Parameter_ID=concepts[wl[k, "concept"]],
            Value="".join(wl[k, "tokens"]),
            Form="".join(wl[k, "tokens"]),
            Segments=" ".join(
                [converter.get(x, x) for x in wl[k, "tokens"]]).split(),
            Source=["Ivani2019"],
        )
def run(args):
    """
    main function.

    Check syllable-structure annotations of the dataset's CLDF forms and
    print three markdown reports: segment/structure length mismatches,
    unrecognized ('?') structure slots, and morphemes lacking a nucleus
    ('n') or tone ('t') slot.
    """
    ds = get_dataset(args)
    # --medials arrives as a comma-separated CLI string.
    if args.medials:
        args.medials = set(args.medials.split(','))
    # error class -> {(segments, structure): [offending rows]}
    errors = {
        'length': defaultdict(list),
        'syllable': defaultdict(list),
        'missing': defaultdict(list)
    }
    if ds.cldf_dir.joinpath("forms.csv").exists():
        for row in progressbar(ds.cldf_reader()["FormTable"],
                               desc='iterate over wordlist'):
            # Restrict to one doculect when --doculect is given.
            if row['Language_ID'] == args.doculect or not args.doculect:
                strucs = get_structure(row['Segments'],
                                       medials=args.medials or MEDIALS)
                # Compare each morpheme with its structure template.
                for i, (struc, segments) in enumerate(
                        zip(strucs, morphemes(row['Segments']))):
                    if len(struc) != len(segments):
                        errors['length'][' '.join(segments), ' '.join(struc)] += [
                            (row['ID'], i, row['Language_ID'], row['Form'],
                             row['Segments'])
                        ]
                    elif '?' in struc:
                        errors['missing'][' '.join(segments), ' '.join(struc)] += [
                            (row['ID'], i, row['Language_ID'], row['Form'],
                             row['Segments'])
                        ]
                    elif not 'n' in struc or not 't' in struc:
                        errors['syllable'][' '.join(segments), ' '.join(struc)] += [
                            (row['ID'], i, row['Language_ID'], row['Form'],
                             row['Segments'])
                        ]
    # Emit one pipe-formatted table per error class that has entries.
    for error, errorname in [('length', 'Length Errors'),
                             ('missing', 'Missing Values'),
                             ('syllable', 'Syllable Errors')]:
        if errors[error]:
            print('# ' + errorname + '\n')
            table = []
            for i, ((segments, structure), examples) in enumerate(
                    errors[error].items()):
                table += [[i + 1, segments, structure, len(examples)]]
            print(
                tabulate(
                    table,
                    tablefmt='pipe',
                    headers=['Number', 'Segments', 'Structure', 'Examples']))
            print('')
def cmd_makecldf(self, args):
    """
    Convert the raw data to a CLDF dataset.

    Besides writing forms and expert cognates, this builds a plain lingpy
    wordlist (raw/wordlist.tsv) so the data can be inspected in EDICTOR.
    """
    # concepts: source gloss -> concept ID; wl_concepts: source gloss ->
    # English gloss (used for the EDICTOR wordlist below).
    concepts, wl_concepts = {}, {}
    visited = set()
    for concept in self.concepts:
        cid = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH']))
        # Skip duplicate concept IDs; one list entry may cover several
        # source glosses, all mapped to the same concept.
        if cid in visited:
            pass
        else:
            visited.add(cid)
            args.writer.add_concept(
                ID=cid,
                Name=concept['ENGLISH'],
                Glosses_in_Source=concept['GLOSSES_IN_SOURCE'],
                Concepticon_ID=concept['CONCEPTICON_ID'],
                Concepticon_Gloss=concept['CONCEPTICON_GLOSS'])
            for gloss in concept['GLOSSES_IN_SOURCE'].split(' // '):
                concepts[gloss] = cid
                wl_concepts[gloss] = concept['ENGLISH']
    languages = args.writer.add_languages(lookup_factory="Name_in_Source")
    args.writer.add_sources()
    # make a wordlist for edictor to inspect the data
    D = {0: ['doculect', 'concept', 'ipa', 'cogid']}
    idx = 1
    for i, row in progressbar(
            enumerate(
                self.raw_dir.read_csv('data.tsv', delimiter='\t',
                                      dicts=True))):
        for language, lid in languages.items():
            form = row[language].strip()
            if form:
                lexemes = args.writer.add_forms_from_value(
                    Language_ID=lid,
                    Parameter_ID=concepts[row['Meaning']],
                    Value=form,
                    Source='Holm2017')
                if lexemes:
                    # Rows are cognate across languages: the row number
                    # serves as the cognate-set ID.
                    args.writer.add_cognate(
                        lexeme=lexemes[0],
                        Cognateset_ID=str(i + 1),
                        Cognate_Detection_Method='expert',
                        Source='Holm2017')
                # Mirror the entry into the EDICTOR wordlist.
                D[idx] = [
                    language, wl_concepts[row['Meaning']], form, i + 1
                ]
                idx += 1
    Wordlist(D).output(
        'tsv', filename=self.raw_dir.joinpath('wordlist').as_posix())
def cmd_makecldf(self, args):
    """Build CLDF forms and cognates from the Peiros 2004 etymology file."""
    # add sources
    args.writer.add_sources()
    # add concepts
    concepts = args.writer.add_concepts(
        id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
        lookup_factory="Name",
    )
    # fix concept: the raw data uses gloss spellings that differ from the
    # concept list, so alias them to the canonical entries.
    concepts["fat (n.)"] = concepts["fat n."]
    concepts["burn (tr.)"] = concepts["burn tr."]
    concepts["to fly"] = concepts["fly v."]
    concepts["lie (down)"] = concepts["lie"]
    concepts["walk (go)"] = concepts["walk(go)"]
    args.log.info("added concepts")
    # add languages
    languages = {}
    for language in self.languages:
        args.writer.add_language(**language)
        languages[language["Name"]] = language["ID"]
    args.log.info("added languages")
    # add data: the raw file is headerless, columns are positional.
    for row_ in progressbar(
            self.raw_dir.read_csv("Peiros2004-data by etymology.txt",
                                  delimiter="\t")):
        if "".join(row_).strip():
            row = dict(
                zip(["CONCEPT", "SUBGROUP", "LANGUAGE", "FORM", "COGNACY"],
                    row_))
            bsource = ""
            # COGNACY is either a numeric cognate-set ID or a loan marker
            # of the form '< source'; loans get cognate set 0.
            if row["COGNACY"].isdigit():
                cogid = int(row["COGNACY"])
            elif row["COGNACY"].startswith("<"):
                bsource = row["COGNACY"].split(" ")[1]
                cogid = 0
            else:
                cogid = 0
            for lexeme in args.writer.add_forms_from_value(
                    # strip apostrophes from glosses before the lookup
                    Parameter_ID=concepts[re.sub("'", "", row["CONCEPT"])],
                    Language_ID=languages[row["LANGUAGE"].strip()],
                    Value=row["FORM"].strip(),
                    Source=["Peiros2004a"],
                    LoanSource=bsource,
                    Loan=True if bsource else False,
            ):
                args.writer.add_cognate(
                    lexeme,
                    Cognateset_ID=cogid,
                    Source=["Peiros2004a"],
                )
def run(args):
    """
    Check morpheme-structure annotations of the CLDF wordlist and print
    a markdown report of problematic morphemes.
    """
    ds = Dataset(args)
    wl = Wordlist.from_cldf(str(ds.cldf_specs().metadata_path))
    print('loaded wordlist')
    # Report forms with a leading/trailing or doubled morpheme boundary.
    for idx, form, tokens in wl.iter_rows('form', 'tokens'):
        if str(tokens).endswith('+') or str(tokens).startswith('+'):
            print(idx, tokens)
        elif '+ +' in str(tokens):
            print(idx, form, tokens)
    # Derive a per-morpheme structure string from the segments.
    wl.add_entries(
        'structure', 'tokens', lambda x: basictypes.lists(' + '.join(
            [' '.join(y) for y in segments.get_structure(x)])))
    errors = []
    for idx, doculect, concept, value, form, tokens, structure in progressbar(
            wl.iter_rows('doculect', 'concept', 'value', 'form', 'tokens',
                         'structure')):
        # .n splits the annotation into morphemes; counts must agree.
        if len(tokens.n) != len(structure.n):
            print('Wrong Length: {0} // {1}'.format(tokens, structure))
        for tok, struc in zip(tokens.n, structure.n):
            error = ''
            if len(tok) != len(struc):
                error = 'wrong length'
            elif not 'n' in struc:
                # 'n' marks the nucleus slot in the structure template.
                error = 'missing vowel'
            #elif struc[0] == 'm':
            #    error = 'medial as initial'
            if error.strip():
                errors += [[
                    idx, doculect, concept, value, form, tok, struc, error
                ]]
    # Sort by error type, then structure, then doculect; prepend a counter.
    table = sorted(errors, key=lambda x: (x[-1], x[-2], x[1]))
    for i, line in enumerate(table):
        table[i] = [i + 1] + line
    print(
        tabulate(table,
                 headers=[
                     'Count', 'ID', 'Doculect', 'Concept', 'Value', 'Form',
                     'Token', 'Structure', 'Error'
                 ],
                 tablefmt='pipe'))
    # Also dump the distinct offending morphemes, tab-separated.
    morphemes = set([(line[-4], str(line[-3]), str(line[-2]))
                     for line in table])
    for a, b, c in sorted(morphemes, key=lambda x: x[-2]):
        print(a + '\t' + b + '\t' + c)
def cmd_makecldf(self, args):
    """Convert the HSH-SCL lingpy wordlist into CLDF."""
    args.writer.add_sources()
    lang_ids = args.writer.add_languages(lookup_factory="Name")
    concept_ids = args.writer.add_concepts(
        id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
        lookup_factory="Name")
    wordlist = lingpy.Wordlist(self.raw_dir.joinpath("HSH-SCL.csv").as_posix())
    for row in pylexibank.progressbar(wordlist):
        args.writer.add_forms_from_value(
            Language_ID=lang_ids[wordlist[row, "language"]],
            Value=wordlist[row, "reflex"],
            Source=["SoHartmann1988"],
            Parameter_ID=concept_ids[wordlist[row, "concept"]],
        )
def cmd_makecldf(self, args):
    """Convert the cleaned Kraft wordlist into CLDF."""
    args.writer.add_sources()
    lang_ids = args.writer.add_languages(lookup_factory="Name")
    concept_ids = args.writer.add_concepts(
        id_factory=lambda x: x.number + "_" + slug(x.english),
        lookup_factory="Name")
    rows = self.raw_dir.read_csv("clean_data.tsv", delimiter="\t", dicts=True)
    for record in pylexibank.progressbar(rows):
        args.writer.add_forms_from_value(
            Language_ID=lang_ids[record["LANGUAGE"]],
            Parameter_ID=concept_ids[record["CONCEPT"]],
            Value=record["VALUE"],
            Source=["Kraft1981"],
        )
def cmd_makecldf(self, args):
    """Convert wordlists.csv (one column per language) into CLDF."""
    table = self.raw_dir.read_csv("wordlists.csv", dicts=True)
    args.writer.add_sources()
    languages = args.writer.add_languages()
    concepts = args.writer.add_concepts(
        id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
        lookup_factory="Name")
    for record in pylexibank.progressbar(table):
        # Every language column of the row yields one (or more) forms.
        for lid in languages:
            args.writer.add_forms_from_value(
                Language_ID=lid,
                Parameter_ID=concepts[record["English"]],
                Value=record[lid],
                Source=["Tolmie1884"],
            )
def cmd_makecldf(self, args):
    """Build CLDF forms and cognates from the ST comparative table."""
    data = self.raw_dir.read_csv('st-data.tsv', delimiter='\t', dicts=True)
    args.writer.add_sources()
    # note: no way to easily replace this with the direct call to `add_concepts`
    # as we add the Chinese gloss via concept.attributes
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Number=concept.number,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        # Rows reference concepts via the NUMBER column.
        concepts[concept.number] = idx
    languages = args.writer.add_languages(lookup_factory="Abbreviation")
    visited = set()
    for row in pylexibank.progressbar(data[1:], desc='cldfify',
                                      total=len(data)):
        # Data columns use upper-cased language abbreviations; the lookup
        # table keys are assumed lower-case — TODO confirm against
        # languages.csv.
        for language in map(lambda x: x.upper(), languages):
            if language in row:
                if row[language].strip():
                    entry = clean_entry(row[language])
                    if entry.strip():
                        lexeme = args.writer.add_form(
                            Language_ID=languages[language.lower()],
                            Parameter_ID=concepts[row['NUMBER']],
                            Value=row[language],
                            Form=entry,
                            Source='Peiros2004')
                        # '<ABBR>NUM' columns hold the cognate-set number.
                        args.writer.add_cognate(
                            lexeme=lexeme,
                            Cognateset_ID="{0}-{1}".format(
                                row['NUMBER'], row[language + 'NUM']),
                            Source='Peiros2004')
            else:
                # Report each language column missing from the data once.
                if language not in visited:
                    visited.add(language)
                    print(language)
def cmd_makecldf(self, args):
    """Convert raw.csv (languages as columns) into CLDF."""
    rows = self.raw_dir.read_csv("raw.csv", dicts=True)
    args.writer.add_sources()
    languages = args.writer.add_languages(lookup_factory="Name")
    concepts = args.writer.add_concepts(
        id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
        lookup_factory="Name")
    for row in pylexibank.progressbar(rows):
        for column, lexeme in row.items():
            # Only columns naming a known language hold form data.
            if column not in languages:
                continue
            args.writer.add_forms_from_value(
                Language_ID=languages[column],
                Parameter_ID=concepts[row["Gloss"]],
                Value=lexeme,
                Source="Othaniel2017",
            )
def cmd_makecldf(self, args):
    """Convert raw.csv to CLDF, re-labelling tone segments per language.

    Fixes relative to the previous version:

    * ``missing`` was a plain dict incremented with ``+=`` on unseen keys,
      raising ``KeyError`` on the first unmatched gloss; counting now uses
      ``dict.get`` with a default of 0.
    * ``add_languages()`` was called twice (once without a lookup, once
      with), writing duplicate language rows.
    """
    data = self.raw_dir.read_csv('raw.csv', dicts=True)
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split('-')[-1] + '_' + slug(concept.gloss)
        args.writer.add_concept(
            ID=idx,
            Name=concept.gloss,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            Chinese_Gloss=concept.attributes['chinese'])
        # Raw rows reference concepts by their Chinese gloss.
        concepts[concept.attributes['chinese']] = idx
    languages = args.writer.add_languages(lookup_factory='Name')
    args.writer.add_sources()

    # add the tones for the segmented entries:
    # (language, tone) -> 'category/tone'
    tones = {(
        row['Language_ID'], row['Tone']
    ): row['Tone_category'] + '/' + row['Tone'] for row in
        self.raw_dir.read_csv('hm-tones.tsv', delimiter='\t', dicts=True)}

    # Count raw glosses with no matching concept.
    missing = {}
    for cgloss, entry in progressbar(enumerate(data),
                                     desc='cldfify the data',
                                     total=len(data)):
        if entry['Chinese gloss'] in concepts:
            for language in languages:
                if entry[language].strip():
                    lexemes = args.writer.add_lexemes(
                        Language_ID=languages[language],
                        Parameter_ID=concepts[entry['Chinese gloss']],
                        Value=entry[language],
                        Source=['Chen2012'])
                    # Replace plain tone segments with their per-language
                    # 'category/tone' labels.
                    for lexeme in lexemes:
                        lexeme['Segments'] = [
                            tones.get((lexeme['Language_ID'], s), s)
                            for s in lexeme['Segments']
                        ]
        else:
            missing[entry["Chinese gloss"]] = \
                missing.get(entry["Chinese gloss"], 0) + 1
def cmd_makecldf(self, args):
    """Build the Sino-Tibetan CLDF dataset from the raw lingpy wordlist."""
    wl = lingpy.Wordlist(str(self.raw_dir / "sino-tibetan-raw.tsv"))
    args.writer.add_sources()
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
        args.writer.add_concept(
            ID=idx,
            TBL_ID=concept.attributes["huang_1992_1820"],
            Name=concept.english,
            Coverage=concept.attributes["coverage"],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[concept.english] = idx
    # Map in-source doculect names to language IDs and source keys.
    languages, sources = {}, {}
    for language in self.languages:
        args.writer.add_language(**language)
        languages[language["Name_in_Source"]] = language["ID"]
        sources[language["Name_in_Source"]] = language["Source"]
    for idx in pylexibank.progressbar(wl, desc="cldfify"):
        # Skip rows whose tokens are empty or consist only of '+' markers.
        if wl[idx, "tokens"] and " ".join(wl[idx, "tokens"]).strip("+"):
            row = args.writer.add_form(
                Language_ID=languages[wl[idx, "doculect"]],
                Local_ID=idx,
                Parameter_ID=concepts[wl[idx, "concept"]],
                # prefer the source spelling, then tokens, then plain IPA
                Value=wl[idx, "entry_in_source"].strip()
                or "".join(wl[idx, "tokens"]) or wl[idx, "ipa"],
                Form=".".join(wl[idx, "tokens"]),
                Source=sources[wl[idx, "doculect"]].split(","),
                Comment=wl[idx, "note"],
                Cognacy=wl[idx, "cogid"],
                Loan=True if wl[idx, "borrowing"].strip() else False,
            )
            # Cognate sets are scoped per concept: '<cogid>-<concept-slug>'.
            args.writer.add_cognate(
                lexeme=row,
                Cognateset_ID="{0}-{1}".format(wl[idx, "cogid"],
                                               slug(wl[idx, "concept"])),
                Source="Sagart2018",
                Alignment="",
                Alignment_Source="",
            )
def cmd_makecldf(self, args):
    """Build CLDF from the YN-RGLD lingpy wordlist."""
    # Read raw data
    wordlist = lingpy.Wordlist(self.raw_dir.joinpath("YN-RGLD.csv").as_posix())
    args.writer.add_sources()
    concept_ids = args.writer.add_concepts(
        id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
        lookup_factory="SrcId")
    lang_ids = args.writer.add_languages(lookup_factory="Name")

    # add lexemes, silently skipping rows whose language or concept
    # is not covered by the two lookup tables.
    for idx, doculect, srcid, reflex in pylexibank.progressbar(
            wordlist.iter_rows("doculect", "srcid", "reflex"),
            desc="make-cldf"):
        if doculect not in lang_ids or srcid not in concept_ids:
            continue
        args.writer.add_forms_from_value(
            Language_ID=lang_ids[doculect],
            Parameter_ID=concept_ids[srcid],
            Value=reflex,
            Source=["Nagano2013"],
        )
def cmd_makecldf(self, args):
    """Build the TLS CLDF dataset from tls.txt."""
    # Concepts are referenced from the raw data via their list number.
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
        concepts[concept.number] = idx
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            NUMBER=concept.number,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            Swahili_Gloss=concept.attributes["swahili"],
        )
    # Add sources
    args.writer.add_sources()
    # Add languages
    language_lookup = args.writer.add_languages(lookup_factory="Name")
    # TODO: add STEM and PREFIX? (pay attention to multiple forms)
    for entry in pylexibank.progressbar(
            self.raw_dir.read_csv("tls.txt", dicts=True)):
        # Skip over when language is "Note" (internal comments) or
        # "Gweno1" (a copy of "Gweno")
        if entry["LGABBR"] in ["Note", "Gweno1"]:
            continue
        # Normalize SRCID to the concept-number format: drop '.0',
        # turn '.5' sub-numbers into an 'a' suffix.
        src_idx = entry["SRCID"].replace(".0", "").replace(".5", "a")
        # Fix values if possible (for common problems not in lexemes.csv)
        value = entry["REFLEX"]
        if src_idx not in concepts:
            continue
        args.writer.add_forms_from_value(
            Language_ID=language_lookup[entry["LGABBR"]],
            Parameter_ID=concepts[src_idx],
            Value=value,
            Source=["Nurse1975", "Nurse1979", "Nurse1980", "TLS1999"],
        )
def cmd_makecldf(self, args):
    """Build CLDF from the old CLICS wordlist, creating languages and
    concepts lazily on first occurrence."""
    wl = lingpy.Wordlist(
        self.raw_dir.joinpath('D_old-clics.tsv').as_posix())
    args.log.info('loaded wordlist')
    # Map the wordlist's source labels to BibTeX keys.
    src = {
        'wold': 'Wold2009',
        'ids': 'Key2007',
        'logos': 'Logos2008',
        'Språkbanken': 'Saxena2013'
    }
    args.writer.add_sources()
    concepts = set()
    languages = set()
    # Concepticon ID -> gloss, for filling in missing glosses.
    concepticon = {
        c.id: c.gloss for c in Concepticon().conceptsets.values()
    }
    args.log.info('added concepticon')
    for k in progressbar(wl, desc='wl-to-cldf'):
        if wl[k, 'value']:
            # Register each doculect once, slugged for the language ID.
            if wl[k, 'doculect'] not in languages:
                args.writer.add_language(ID=slug(wl[k, 'doculect'],
                                                 lowercase=False),
                                         Name=wl[k, 'doculect'],
                                         Glottocode=wl[k, 'glottolog'])
                languages.add(wl[k, 'doculect'])
            # Register each concept once, slugged for the parameter ID.
            if wl[k, 'concept'] not in concepts:
                args.writer.add_concept(
                    ID=slug(wl[k, 'concept'], lowercase=False),
                    Name=wl[k, 'concept'],
                    Concepticon_ID=wl[k, 'concepticon_id'],
                    Concepticon_Gloss=concepticon.get(
                        wl[k, 'concepticon_id'], ''))
                concepts.add(wl[k, 'concept'])
            args.writer.add_lexemes(Language_ID=slug(wl[k, 'doculect'],
                                                     lowercase=False),
                                    Parameter_ID=slug(wl[k, 'concept'],
                                                      lowercase=False),
                                    Value=wl[k, 'value'],
                                    Source=src.get(wl[k, 'source'], ''))
def cmd_makecldf(self, args):
    """Read forms from the tryon.db SQLite dump and write CLDF.

    Fix: the database connection was only closed on the success path; it
    is now released in a ``finally`` block so it is closed even when the
    conversion raises mid-way.
    """
    conn = sqlite3.connect((self.raw_dir / "tryon.db").as_posix())
    try:
        cursor = conn.cursor()
        cursor.execute(QUERY)
        args.writer.add_sources()
        languages = args.writer.add_languages(lookup_factory="Name")
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
            lookup_factory="Name")
        for lang, param, value in progressbar(cursor.fetchall()):
            # Skip NULL/empty values.
            if value:
                args.writer.add_forms_from_value(
                    Language_ID=languages[lang],
                    Parameter_ID=concepts[param],
                    # apply dataset-level lexeme replacements, then strip
                    Value=self.lexemes.get(value, value).strip(),
                    Source=['Tryon1983'],
                )
    finally:
        conn.close()
def cmd_makecldf(self, args):
    """Build the CLDF dataset, linking forms to cognate sets.

    Fixes relative to the previous version:

    * ``language`` in the main loop is a dict from ``self.languages``, so
      ``language != "Tangut"`` compared a dict with a string and was
      always true — Tangut was never excluded as intended; the name is
      compared now;
    * ``Number`` on the concept was set to the English gloss instead of
      the concept number (copy-paste error);
    * the ``add_languages`` lookup was built but bypassed in favor of the
      raw name; ``Language_ID`` now goes through the lookup.
    """
    data = self.raw_dir.read_csv("data.tsv", delimiter="\t", dicts=True)
    args.writer.add_sources()
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Number=concept.number,
            Variants=concept.attributes["lexibank_gloss"],
        )
        # Any of the gloss variants may appear in the raw data.
        for variant in concept.attributes["lexibank_gloss"]:
            concepts[variant] = idx
        concepts[concept.english] = idx
    languages = args.writer.add_languages(lookup_factory="Name")
    # Only instance where the variant is switched, so we fix that manually.
    concepts["duck²⁹"] = "51_duck"
    for i, row in progressbar(enumerate(data)):
        for language in self.languages:
            if language["Name"] == "Tangut":
                continue
            # The form may live in '<Name>' or '<Name>_form'; the gloss
            # always lives in '<Name>_gloss'.
            entry = row.get(language["Name"])
            if not entry:
                entry = row.get(language["Name"] + "_form")
            concept = concepts.get(row.get(language["Name"] + "_gloss"))
            if entry and concept and entry not in ["NA"] and \
                    concept not in ["NA"]:
                cogset = args.writer.add_forms_from_value(
                    Language_ID=languages[language["Name"]],
                    Parameter_ID=concept,
                    Value=entry,
                    Source="Sims2020",
                )[0]
                args.writer.add_cognate(
                    cogset,
                    Cognateset_ID=row["Set #1"],
                    STEDT=str(row["STEDT # "] if "STEDT # " in row else ""),
                    Source="Sims2020",
                )
def cmd_makecldf(self, args):
    """Build the sign-language alphabet dataset from the lingpy wordlist."""
    wl = lingpy.Wordlist(str(self.raw_dir / 'signalphabets.tsv'))
    # Concepts are numbered 1..n in wordlist row order.
    concepts, sources = {}, {}
    for i, c in enumerate(wl.rows):
        args.writer.add_concept(
            ID=str(i + 1),
            Name=c,
        )
        concepts[c] = str(i + 1)
    for language in self.languages:
        args.writer.add_language(
            ID=language['Name_in_Database'],
            Name=language['Name'],
            Latitude=language['Latitude'],
            Longitude=language['Longitude'],
            Glottocode=language['Glottolog'],
            SubGroup=language['SubGroup'],
        )
        sources[language['Name_in_Database']] = language['Source']
    # 'Ukranian_SL' is the (misspelled) doculect name used in the raw
    # wordlist; map it to its source and to the corrected language ID.
    sources['Ukranian_SL'] = 'Lydell2018'
    languages = {language: language for language in sources}
    languages['Ukranian_SL'] = 'Ukrainian_SL'
    # Only sources actually referenced by a language are written out.
    args.writer.add_sources(
        *[x for x in self.raw_dir.read_bib() if x.id in sources])
    for i, c, l, h1, h2, t, cid in progressbar(wl.iter_rows(
            'concept', 'doculect', 'handshape_1', 'handshape_2', 'tokens',
            'cogid'),
                                               desc='makecldf'):
        # The raw value is the pair of handshapes; Form holds the tokens.
        row = args.writer.add_form(Value=h1 + ' ' + h2,
                                   Language_ID=languages[l],
                                   Parameter_ID=concepts[c],
                                   Form=' '.join(t),
                                   Source=sources[l])
        args.writer.add_cognate(
            lexeme=row,
            Cognateset_ID=cid,
        )