def test_ipa2tokens(alm):
    # iterate over the keys
    for key in alm:  # get_list(language="Turkish", flat=True):
        ipa = alm[key, 'ipa']
        tokens_a = alm[key, 'tokensa'].split(' ')
        tokens_b = alm[key, 'tokensb'].split(' ')
        new_tokens_a = lp.ipa2tokens(ipa, merge_vowels=True, merge_geminates=False)
        new_tokens_b = lp.ipa2tokens(ipa, merge_vowels=False, merge_geminates=False)
        assert tokens_a == new_tokens_a
        assert tokens_b == new_tokens_b
def test_ipa2tokens(self):
    # iterate over the keys
    for key in self.alm:  # .get_list(language="Turkish", flat=True):
        ipa = self.alm[key, 'ipa']
        tokensA = self.alm[key, 'tokensa'].split(' ')
        tokensB = self.alm[key, 'tokensb'].split(' ')
        new_tokensA = lp.ipa2tokens(ipa, merge_vowels=True)
        new_tokensB = lp.ipa2tokens(ipa, merge_vowels=False)
        assert tokensA == new_tokensA
        assert tokensB == new_tokensB
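# A minimal sketch of what the two tests above compare: the same IPA string
# tokenized with and without vowel merging. The input is the example from
# LingPy's own documentation; the exact token lists can vary with the LingPy
# version, so the expected outputs are indicative only.
import lingpy as lp

word = 't͡sɔyɡə'
print(lp.ipa2tokens(word, merge_vowels=True))   # expected: ['t͡s', 'ɔy', 'ɡ', 'ə']
print(lp.ipa2tokens(word, merge_vowels=False))  # expected: ['t͡s', 'ɔ', 'y', 'ɡ', 'ə']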
def tokenize_word_reversibly(ipa):
    """Reversibly convert an IPA string into a list of tokens.

    In contrast to LingPy's tokenize_word, do this without removing
    symbols. This means that the original IPA string can be recovered
    from the tokens.
    """
    tokenized_word = ipa2tokens(ipa, merge_vowels=False, merge_geminates=False)
    token = 0
    index = 0
    for i in ipa:
        try:
            tokenized_word[token][index]
        except IndexError:
            token += 1
            index = 0
        try:
            if i != tokenized_word[token][index]:
                if index == 0:
                    tokenized_word.insert(token, i)
                else:
                    tokenized_word[token] = (tokenized_word[token][:index] +
                                             i +
                                             tokenized_word[token][index:])
        except IndexError:
            tokenized_word.append(i)
        index += 1
    assert ''.join(tokenized_word) == ipa
    return tokenized_word
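# Round-trip sketch for tokenize_word_reversibly: whatever tokenization
# ipa2tokens produces, joining the tokens must restore the input exactly
# (the function asserts this itself). The word is an arbitrary example.
from lingpy import ipa2tokens  # the function above expects this in scope

word = 'tʰoxtər'
tokens = tokenize_word_reversibly(word)
assert ''.join(tokens) == word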
def read_data_cldf(datafile, sep="\t", char_list=None,
                   cogids_are_cross_semantically_unique=True,
                   data='ASJP'):
    """Read a CLDF file in TSV or CSV format."""
    # avoid a shared mutable default argument
    if char_list is None:
        char_list = set()
    reader = csv.DictReader(
        datafile, dialect='excel' if sep == ',' else 'excel-tab')
    langs = set()
    data_dict = collections.defaultdict(lambda: collections.defaultdict())
    cogid_dict = collections.defaultdict(lambda: collections.defaultdict())
    words_dict = collections.defaultdict(lambda: collections.defaultdict(list))
    for line, row in enumerate(reader):
        lang = row["Language ID"]
        langs.add(lang)
        if data == 'ASJP':
            try:
                asjp_word = clean_word(row["ASJP"])
            except KeyError:
                asjp_word = ipa2asjp.ipa2asjp(row["IPA"])
        elif data == 'IPA':
            asjp_word = tuple(lingpy.ipa2tokens(row["IPA"], merge_vowels=False))
        else:
            asjp_word = row[data]
        if not asjp_word:
            continue
        for ch in asjp_word:
            if ch not in char_list:
                char_list.add(ch)
        concept = row["Feature ID"]
        cogid = row["Cognate Class"]
        data_dict[concept][line, lang] = asjp_word
        cogid_dict.setdefault(
            cogid if cogids_are_cross_semantically_unique else (cogid, concept),
            set()).add((lang, concept, asjp_word))
        words_dict[concept].setdefault(lang, []).append(asjp_word)
    return (data_dict, list(cogid_dict.values()), words_dict,
            list(langs), char_list)
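# Hedged usage sketch for read_data_cldf: the file name is hypothetical, and
# the file must contain the columns the function reads above ("Language ID",
# "Feature ID", "Cognate Class", and "IPA" and/or "ASJP").
with open('mydataset.tsv', encoding='utf-8') as f:
    data_dict, cogid_sets, words_dict, langs, chars = read_data_cldf(f, data='IPA')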
def get_structure(word, sep='+', zipped=False,
                  semi_diacritics='hsʃʂʒʐzθɕʑfvθðnmȵ'):
    if not isinstance(word, (list, tuple)):
        word = lingpy.ipa2tokens(word, expand_nasals=True,
                                 merge_vowels=False,
                                 semi_diacritics=semi_diacritics)

    # check for unknown characters
    try:
        tokens2class(word, 'cv', cldf=True)
    except ValueError:
        print('problem with {0}'.format(''.join(word)))
        return

    # get the morphemes
    if sep in word:
        words = tokens2morphemes(word, cldf=True)
        morphemes = []
        for w in words:
            morphemes += tokens2morphemes(w, sep=sep)
    else:
        morphemes = tokens2morphemes(word, cldf=True)

    # get the basic structure for each morpheme
    for morpheme in morphemes:
        try:
            segments = parse_chinese_morphemes(morpheme)
        except Exception:
            # yield a placeholder and skip to the next morpheme; without
            # the continue, the loop would fall through and reuse a stale
            # (or undefined) `segments`
            if not zipped:
                yield ['NULL']
            else:
                yield ([('NULL', 'NULL')], morpheme)
            continue
        if not zipped:
            yield [x for x, y in zip('imnct', segments) if y != '-']
        else:
            yield ([x for x in zip('imnct', segments) if x[1] != '-'],
                   morpheme)
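# get_structure is a generator yielding one template per morpheme. A
# hypothetical call (the output depends on parse_chinese_morphemes below and
# on LingPy's sound-class models, so the shape shown is only indicative):
for template in get_structure('ʈʂʰwaŋ⁵⁵'):
    print(template)  # e.g. ['i', 'm', 'n', 't'] -- initial, medial, nucleus, tone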
output[['Taxon', 'Gloss']] = output[['Taxon', 'Gloss']].astype('string')
output['dbID'] = [db + '_' + str(x - 1) for x in output.ID.values]
output.to_csv('reformattedData/asjp/' + db + '.tsv', encoding='utf-8',
              sep='\t', index=False)

# note: this script relies on Python-2-era APIs (unicode, pd.match)
for f in [x for x in os.listdir(path + 'data/list_length_project/sets/mattis_new/output')
          if x != '.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path + '/data/list_length_project/sets/mattis_new/output/' + f,
                         encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    output = pd.DataFrame()
    output['ID'] = arange(len(data)) + 1
    output['Taxon'] = data.language.values
    output['Gloss'] = data.gloss.values
    output['GlossID'] = pd.match(data.gloss.values, data.gloss.unique()) + 1
    output['IPA'] = [re.sub(r"[ -]", "", unicode(x)) for x in data.transcription]
    output['Tokens'] = [' '.join(lp.ipa2tokens(unicode(w))) for w in output.IPA]
    cClasses = array([x + ':' + unicode(y).strip('?')
                      for (x, y) in data[['gloss', 'cognate_class']].values])
    output['CogID'] = pd.match(cClasses, unique(cClasses))
    output[['Taxon', 'Gloss']] = output[['Taxon', 'Gloss']].astype('string')
    output['dbID'] = [db + '_' + str(x - 1) for x in output.ID.values]
    output.to_csv('reformattedData/ipa/' + db + '.tsv', encoding='utf-8',
                  sep='\t', index=False)

for f in [x for x in os.listdir(path + 'data/list_length_project/sets/abvd2/output')
          if x != '.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path + '/data/list_length_project/sets/abvd2/output/' + f,
                         encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    data = data[[',' not in unicode(x) for x in data.cognate_class.values]]
    data = data[['?' not in unicode(x) for x in data.cognate_class.values]]
def parse_chinese_morphemes(seq, context=False):
    """Parse a Chinese syllable and return its basic structure."""

    # get the tokens
    if isinstance(seq, list):
        tokens = [s for s in seq]
    else:
        tokens = lingpy.ipa2tokens(seq, merge_vowels=False)

    # get the sound classes according to the art-model
    arts = [int(x) for x in lingpy.tokens2class(tokens, _art, cldf=True)]

    # get the prosodic string
    prostring = lingpy.prosodic_string(arts)

    # parse the zip of tokens and arts
    I, M, N, C, T = '', '', '', '', ''

    ini = False
    med = False
    nuc = False
    cod = False
    ton = False

    triples = [('?', '?', '?')] + list(zip(tokens, arts, prostring)) + \
        [('?', '?', '?')]

    for i in range(1, len(triples) - 1):
        t, c, p = triples[i]
        _t, _c, _p = triples[i - 1]
        t_, c_, p_ = triples[i + 1]

        # check for initial entry first
        if p == 'A' and _t == '?':
            # if we have a j-sound and a vowel follows, we go directly to
            # the medial environment
            if t[0] in 'jɥw':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                ini = True
                med, nuc, cod, ton = False, False, False, False

        # check for initial vowel
        elif p == 'X' and _t == '?':
            if t[0] in 'iuy' and c_ == '7':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                nuc = True
                ini, med, cod, ton = False, False, False, False

        # check for medial after initial
        elif p == 'C':
            med = True
            ini, nuc, cod, ton = False, False, False, False

        # check for vowel medial
        elif p == 'X' and p_ == 'Y':
            # if we have a medial vowel, we classify it as medial
            if t in 'iyu':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                nuc = True
                ini, med, cod, ton = False, False, False, False

        # check for vowel without medial
        elif p == 'X' or p == 'Y':
            if p_ in 'LTY' or p_ == '?':
                nuc = True
                ini, med, cod, ton = False, False, False, False
            elif p == 'Y':
                nuc = True
                ini, med, cod, ton = 4 * [False]
            else:
                cod = True
                ini, med, nuc, ton = 4 * [False]

        # check for consonant
        elif p == 'L':
            cod = True
            ini, med, nuc, ton = 4 * [False]

        # check for tone
        elif p == 'T':
            ton = True
            ini, med, nuc, cod = 4 * [False]

        if ini:
            I += t
        elif med:
            M += t
        elif nuc:
            N += t
        elif cod:
            C += t
        else:
            T += t

    # clumsy conversion for the output, but it does what it is supposed to do
    out = [I, M, N, C, T]
    tf = lambda x: x if x else '-'
    out = [tf(x) for x in out]

    # transform tones to normal letters
    tones = dict(zip('¹²³⁴⁵⁶⁷⁸⁹⁰₁₂₃₄₅₆₇₈₉₀', '12345678901234567890'))

    # now, if context is wanted, we'll return that as well
    ic = '1' if [x for x in I if x in 'bdgmnŋȵɳɴ'] else '0'
    mc = '1' if [m for m in M + N if m in 'ijyɥ'] else '0'
    cc = '1' if C in 'ptkʔ' else '0'
    tc = ''.join([tones.get(x, x) for x in T])
    IC = '/'.join(['I', ic, mc, cc, tc]) if I else ''
    MC = '/'.join(['M', ic, mc, cc, tc]) if M else ''
    NC = '/'.join(['N', ic, mc, cc, tc]) if N else ''
    CC = '/'.join(['C', ic, mc, cc, tc]) if C else ''
    TC = '/'.join(['T', ic, mc, cc, tc]) if T else ''

    if context:
        return out, [x for x in [IC, MC, NC, CC, TC] if x]
    # callers such as get_structure above rely on the plain structure
    return out
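# Hedged example of the return value: a five-slot template
# [initial, medial, nucleus, coda, tone] with '-' marking empty slots. For a
# syllable like 'ʈʂʰwaŋ⁵⁵' the result should look roughly like
# ['ʈʂʰ', 'w', 'a', 'ŋ', '⁵⁵'], but the exact split depends on the art
# sound-class model referenced by `_art`.
print(parse_chinese_morphemes('ʈʂʰwaŋ⁵⁵'))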
def get_tokenizer():
    return lambda x, y: ipa2tokens(y, merge_vowels=False)
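# The returned callable ignores its first argument (presumably a column or
# profile object in the calling code) and tokenizes the second:
from lingpy import ipa2tokens

tokenize = get_tokenizer()
print(tokenize(None, 'tʰoxtər'))  # e.g. ['tʰ', 'o', 'x', 't', 'ə', 'r']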
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary
        files as follows:
        - the JSON object from `metadata.json` is available as `dataset.md`
        - items from languages.csv are available as `dataset.languages`
        - items from concepts.csv are available as `dataset.concepts`
        - if a Concepticon conceptlist was specified in metadata.json,
          its ID is available as `dataset.conceptlist`
    :param glottolog: a `pyglottolog.api.Glottolog` instance.
    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: All arguments passed on the command line.
    """
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())

    # get language identifiers
    lids, cids, coords = {}, {}, {}
    for row in dataset.languages:
        language = row['NAME']
        lids[language] = row['GLOTTOCODE']
    coords = dict([wl.coords[taxon] for taxon in lids])
    modify = {
        'thunder (verb)': 'thunder',
        'flash (verb)': 'lightning',
        'room': 'flat',
        'have diarrea': 'have diarrhoea',
        'watery': 'light'
    }
    for row in dataset.concepts:
        concept = modify[row['CONCEPT']] if row['CONCEPT'] in modify else \
            row['CONCEPT']
        cids[concept] = row['CONCEPT_SET']

    # sources
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2014b')

    # get partial identifiers
    partial_ids = defaultdict(list)
    partial_converter = {}
    idx = 1
    for k in wl:
        for char in wl[k, 'counterpart']:
            if char in partial_converter:
                pidx = partial_converter[char]
            else:
                pidx = idx
                partial_converter[char] = idx
                idx += 1
            partial_ids[k] += [pidx]

    # trace whether the proto-language was visited
    visited = []
    idx = max([k for k in wl]) + 1
    with CldfDataset((
            'ID', 'Language_ID', 'Language_name', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Parameter_Chinese_name',
            'Value', 'Value_Chinese_characters', 'Source', 'Segments',
            'Cognacy', 'Rank', 'Comment'), dataset) as ds:
        ds.sources.add(src)
        ds.sources.add(src2)
        D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']}
        for k in wl:
            tokens = lp.ipa2tokens(wl[k, 'ipa'], merge_vowels=False,
                                   expand_nasals=True)
            # remove sandhi annotation in tokens, as it is confusing clpa
            for i, t in enumerate(tokens):
                if '⁻' in t:
                    tokens[i] = t[:t.index('⁻')]
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                lids[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                cids[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'mandarin'],
                wl[k, 'ipa'],
                wl[k, 'counterpart'],
                SOURCE,
                ' '.join(tokens),
                wl[k, 'cogid'],
                wl[k, 'order'],
                wl[k, 'note'] if wl[k, 'note'] != '-' else '',
            ])
            D[k] = [wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'],
                    tokens, wl[k, 'cogid']]
            if wl[k, 'cogid'] not in visited:
                # we need to add missing tones, otherwise it won't work: we
                # split the syllables first, check whether each syllable
                # ends in a tone, and add '¹' (or '⁴' after a final stop)
                # if it does not
                syllables = wl[k, 'mch'].split('.')
                for i, s in enumerate(syllables):
                    if s[-1] not in '²³':
                        if s[-1] not in 'ptk':
                            syllables[i] += '¹'
                        else:
                            syllables[i] += '⁴'
                tokens = lp.ipa2tokens(''.join(syllables))
                ds.add_row([
                    '{0}-{1}'.format(wl[k, 'concept'], idx),
                    'sini1245',
                    'Middle Chinese',
                    '',
                    cids[wl[k, 'concept']],
                    wl[k, 'concept'],
                    '',
                    wl[k, 'proto'],
                    wl[k, 'counterpart'],
                    SOURCE,
                    ' '.join(tokens),
                    wl[k, 'cogid'],
                    '',
                    ''
                ])
                D[idx] = ['Middle Chinese', wl[k, 'concept'], wl[k, 'mch'],
                          tokens, wl[k, 'cogid']]
                idx += 1
                visited += [wl[k, 'cogid']]

    alms = lp.Alignments(D)
    cognates = [[
        '{0}-{1}'.format(SOURCE, k),
        ds.name,
        alms[k, 'ipa'],
        '-'.join([slug(alms[k, 'concept']), str(alms[k, 'cogid'])]),
        '',
        'expert',
        SOURCE,
        '',
        '',
        ''
    ] for k in alms]
    dataset.cognates.extend(
        iter_alignments(alms, cognates, method='library'))