def evaluate_string(self, string, tokens=False, **keywords):
    setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
    if not tokens:
        tokens = ipa2tokens(string)
    score = 1
    dist = self.dist['#']

    prostring = prosodic_string(
        tokens, rcParams['art'], cldf=keywords['cldf'],
        diacritics=keywords['diacritics'], stress=keywords['stress'])
    if self.classes:
        c = tokens2class(
            tokens, self.model, cldf=keywords['cldf'],
            diacritics=keywords['diacritics'], stress=keywords['stress'])
        teststring = list(zip(prostring, c))
    else:
        teststring = list(zip(prostring, tokens))

    scores = []
    while len(teststring) > 0:
        segment = teststring.pop(0)
        # relative frequency of this segment in the current distribution
        freq = dist.count(segment)
        allf = len(dist)
        s = freq / allf
        score = score * s
        scores += [s]
        # move to the distribution following this segment
        dist = self.dist[segment]
    # the word-final segment is weighted a second time
    score = score * s
    scores += [s]

    lscore = np.log10(score)
    lscore = lscore / len(tokens)

    return score, lscore  # np.log10(score)
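# A minimal usage sketch for evaluate_string above, assuming the method
# belongs to lingpy's MCPhon Markov-chain generator (the training words
# below are made up for illustration).
from lingpy.sequence.generate import MCPhon

mc = MCPhon(['hando', 'fingro', 'mano'])
score, lscore = mc.evaluate_string('handa')
print(score, lscore)  # raw probability and its length-normalized log10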
def turchin(seqA, seqB, model='dolgo', **keywords):
    """
    Return cognate judgment based on the method by :evobib:`Turchin2010`.

    Parameters
    ----------
    seqA, seqB : {str, list, tuple}
        The input strings. These should be iterables, so you can use tuples,
        lists, or strings.
    model : {"asjp", "sca", "dolgo"} (default="dolgo")
        A sound-class model instance or a string that denotes one of the
        standard sound class models used in LingPy.

    Returns
    -------
    cognacy : {0, 1}
        The cognacy assertion which is either 0 (words are probably cognate)
        or 1 (words are not likely to be cognate).
    """
    if text_type(model) == model:
        model = rcParams[model]
    elif hasattr(model, 'info'):
        pass
    else:
        raise ValueError("[!] No valid model instance selected.")

    if isinstance(seqA, (text_type, str)):
        seqA = ipa2tokens(seqA)
        seqB = ipa2tokens(seqB)

    classA = tokens2class(seqA, model)
    classB = tokens2class(seqB, model)

    if classA[0] in model.vowels:
        classA[0] = 'H'
    if classB[0] in model.vowels:
        classB[0] = 'H'

    if ''.join([k for k in classA if k not in model.vowels])[:2] == \
            ''.join([k for k in classB if k not in model.vowels])[:2]:
        return 0
    else:
        return 1
def ipa_to_asjp(w):
    """
    Lingpy IPA-to-ASJP converter plus some cleanup.

    This function is called on IPA datasets.
    """
    w = w.replace('\"', '').replace('-', '').replace(' ', '')
    wA = ''.join(tokens2class(ipa2tokens(w, merge_vowels=False), 'asjp'))
    wAA = clean_asjp(wA.replace('0', '').replace('I', '3').replace('H', 'N'))
    asjp = ''.join([x for x in wAA if x in sounds])
    return asjp
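# Hedged usage sketch for ipa_to_asjp: `clean_asjp` and the module-level
# `sounds` inventory are defined elsewhere in the original module, so the
# stand-ins below are assumptions, not the real definitions.
from lingpy import ipa2tokens, tokens2class

clean_asjp = lambda s: s  # hypothetical no-op stand-in
sounds = set('pbfvmw8tdszcnrlSZCjT5ykgxNqGX7hL4!ieE3auo')  # hypothetical inventory

print(ipa_to_asjp('mano'))  # e.g. 'mano'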
def i2t(s, merge_vowels: bool = True):
    tokens = ipa2tokens(s, merge_vowels=merge_vowels, merge_geminates=True)
    ret = list()
    for token in tokens:
        l = len(token)
        # NOTE(j_luo) Merge geminates into one segment.
        if l % 2 == 0 and token[:l // 2] == token[l // 2:]:
            ret.append(token[:l // 2] + 'ː')
        else:
            ret.append(token)
    return ret
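# Quick illustration of the geminate handling in i2t above: with
# merge_geminates=True, ipa2tokens returns doubled tokens like 'tt', which
# i2t rewrites as a single long segment 'tː' (the input word is made up).
print(i2t('atta'))  # e.g. ['a', 'tː', 'a']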
def __init__(self, words, tokens=False, prostrings=[], classes=False,
             class_model=rcParams['model'], **keywords):
    setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
    self.model = class_model
    self.words = words
    self.tokens = []
    self.bigrams = []
    self.classes = []

    # start filling the dictionary
    for i, w in enumerate(words):
        # check for tokenized string
        if not tokens:
            tk = ipa2tokens(w, **keywords)
        else:
            tk = w[:]
        self.tokens += [tk]

        # create prosodic string
        if prostrings:
            p = prostrings[i]
        else:
            p = prosodic_string(
                tk, rcParams['art'], cldf=keywords['cldf'],
                diacritics=keywords['diacritics'],
                stress=keywords['stress'])

        # create classes
        if classes:
            c = tokens2class(tk, class_model, cldf=keywords['cldf'],
                             diacritics=keywords['diacritics'],
                             stress=keywords['stress'])
            bigrams = list(zip(p, c))
            self.classes += [c]
        else:
            # zip the stuff
            bigrams = list(zip(p, tk))

        # start appending the stuff
        self.bigrams += [bigrams]

    # init the mother object
    MCBasic.__init__(self, self.bigrams)
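# Illustration of the (prosody, segment) bigrams this constructor feeds to
# MCBasic, built with lingpy's public helpers (the word is illustrative).
from lingpy import ipa2tokens, prosodic_string

tk = ipa2tokens('mano')
print(list(zip(prosodic_string(tk), tk)))  # e.g. [('A', 'm'), ('X', 'a'), ...]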
def i2t(ipa):
    ipa = unicodedata.normalize('NFD', ipa)
    ipa = re.sub(r'^\*', '', ipa)
    tokens = ipa2tokens(ipa, merge_vowels=False, merge_geminates=False)
    ret = list()
    for t in tokens:
        # NOTE(j_luo) Stress symbol is not handled by `ipapy`'s canonicalization process.
        t = t.replace("'", 'ˈ')
        # NOTE(j_luo) Not sure what these symbols mean.
        t = t.replace('̣', '').replace('̧', '').replace('̦', '')
        ret.append(str(IPAString(unicode_string=t)))
    return ret
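# Usage sketch for the ipapy-based i2t above, together with the imports it
# relies on; tokens that are not valid IPA make IPAString raise a ValueError.
import re
import unicodedata

from ipapy.ipastring import IPAString
from lingpy import ipa2tokens

print(i2t('*mano'))  # the leading reconstruction asterisk is stripped first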
def test_ipa2tokens():
    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'
    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'
    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'
    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'
    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far
    tokens = csv2list(test_data('test_tokenization.csv'))
    for a, b in tokens:
        tks1 = ' '.join(ipa2tokens(a))
        tks2 = ' '.join(ipa2tokens(a, merge_vowels=False))
        # we check for two variants, since we don't know whether vowels are
        # merged or not in the test data
        assert tks1 == b or tks2 == b
def turchin(seqA, seqB, model='dolgo', **keywords):
    """
    Return cognate judgment based on the method by :evobib:`Turchin2010`.

    Parameters
    ----------
    seqA, seqB : {str, list, tuple}
        The input strings. These should be iterables, so you can use tuples,
        lists, or strings.
    model : {"asjp", "sca", "dolgo"} (default="dolgo")
        A sound-class model instance or a string that denotes one of the
        standard sound class models used in LingPy.

    Returns
    -------
    cognacy : {0, 1}
        The cognacy assertion which is either 0 (words are probably cognate)
        or 1 (words are not likely to be cognate).
    """
    if text_type(model) == model:
        model = rcParams[model]
    elif not hasattr(model, 'info'):
        raise ValueError("[!] No valid model instance selected.")

    if isinstance(seqA, string_types):
        seqA = ipa2tokens(seqA)
        seqB = ipa2tokens(seqB)

    classA = tokens2class(seqA, model)
    classB = tokens2class(seqB, model)

    if classA[0] in model.vowels:
        classA[0] = 'H'
    if classB[0] in model.vowels:
        classB[0] = 'H'

    return int(
        ''.join([k for k in classA if k not in model.vowels])[:2] !=
        ''.join([k for k in classB if k not in model.vowels])[:2])
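# Usage sketch for turchin: 0 means "probably cognate", 1 means "probably
# not" (the IPA strings below are illustrative, not test data from this
# repo).
print(turchin('hant', 'hand'))  # e.g. 0: first two consonant classes match
print(turchin('hant', 'mano'))  # e.g. 1: consonant classes differ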
def ipa_to_asjp(w, params):
    """
    Lingpy IPA-to-ASJP converter plus some cleanup.

    Expects the ``params`` dict to contain the key ``sounds``.
    This function is called on IPA datasets.
    """
    w = w.replace('\"', '').replace('-', '').replace(' ', '')
    wA = ''.join(tokens2class(ipa2tokens(w, merge_vowels=False), 'asjp'))
    wAA = clean_asjp(wA.replace('0', '').replace('I', '3').replace('H', 'N'))
    asjp = ''.join([x for x in wAA if x in params['sounds']])
    assert len(asjp) > 0
    return asjp
def test_ipa2tokens():
    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'
    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'
    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'
    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'
    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far, but only on cases where
    # tokenization doesn't require the merge_vowels=False flag
    tokens = csv2list(test_data('test_tokenization.tsv'))
    for a, b in tokens:
        tks = ' '.join(ipa2tokens(a))
        assert tks == b

    # now test on a smaller set with unmerged vowels
    tokens = csv2list(test_data('test_tokenization_mv.tsv'))
    for a, b in tokens:
        tks = ' '.join(ipa2tokens(a, merge_vowels=False, merge_geminates=False))
        assert tks == b

    # finally, test nasal expansion with semi-diacritic handling
    tokens = csv2list(test_data('test_tokenization_nasals.tsv'))
    for a, b in tokens:
        tks = ' '.join(ipa2tokens(a, merge_vowels=True, merge_geminates=True,
                                  expand_nasals=True, semi_diacritics='h'))
        assert tks == b
def __init__(self, seqs, seqB=False, **keywords):
    # check, whether there are only two sequences or multiple sequence
    # pairs as input
    if seqB:
        self.seqs = [(seqs, seqB)]
    else:
        self.seqs = seqs

    # add the basic representation of sequences
    self.tokens = []
    self.prostrings = []

    # define a tokenizer function for convenience
    defaults = {
        "diacritics": rcParams['diacritics'],
        "vowels": rcParams['vowels'],
        "tones": rcParams['tones'],
        "combiners": rcParams['combiners'],
        "breaks": rcParams['breaks'],
        "stress": rcParams['stress'],
        "merge_vowels": rcParams['merge_vowels'],
    }
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    tokenize = lambda x: ipa2tokens(x, **keywords)

    # start to loop over data and create the stuff
    for k, (seqA, seqB) in enumerate(self.seqs):
        # get the tokens
        tokA, tokB = tokenize(seqA), tokenize(seqB)

        # get the prostrings
        proA, proB = \
            prosodic_string(tokA, **keywords), prosodic_string(tokB, **keywords)

        # append the stuff
        self.tokens += [[tokA, tokB]]
        self.prostrings += [[proA, proB]]
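# Usage sketch, assuming this __init__ belongs to lingpy's Pairwise aligner
# (an assumption based on the attributes it sets up).
from lingpy import Pairwise

pw = Pairwise('woldemort', 'waldemar')
pw.align()
print(pw)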
def i2t(ipa: str) -> List[str]:
    """ipa2tokens call. Raises an error if the result is empty."""
    ret = ipa2tokens(ipa, merge_vowels=True, merge_geminates=False)
    if not ret:
        raise I2tException
    return ret
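# Usage sketch; I2tException is this module's own error type and is assumed
# to be defined alongside i2t.
try:
    tokens = i2t('mano')
except I2tException:
    tokens = []  # fall back for input that cannot be tokenized
print(tokens)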
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA
        object.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.
    filename : str (default="")
        Define the name of the output file. If no name is defined, the name
        of the input file will be taken as a default.
    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa', filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html', filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they
    belong to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and
    adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily
    # task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for local positions
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp'])
                   for seq in msa['seqs']]
        seqs = dict([
            (a[1], b) for a, b in zip(
                sorted(zip(classes, msa['seqs']), key=lambda x: x[0]),
                range(1, len(msa['seqs']) + 1))
        ])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)

        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
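# Usage sketch for msa2html, following the docstring's own example:
# 'harry.msq' is the test file referenced there, and MSA.output('html', ...)
# dispatches to msa2html.
from lingpy import MSA

msa = MSA('harry.msq')
msa.prog_align()
msa.output('html', filename='harry')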
def load_dataset(input_path, source, input_type, output_path):
    print(" - Loading dataset and performing necessary conversion/tokenization.")
    if os.path.exists(output_path):
        print("Using existing wordlist file, nothing is generated.")
        return

    # No NA filter: the word form 'nan' should not be interpreted as NaN :p
    df = pd.read_csv(input_path, sep="\t", na_filter=False)

    # Depending on file format, remove and/or rename columns
    if source == "ielex" or source == "ielex-corr":
        # Rename columns
        df.rename(columns={
            "Language": "DOCULECT",
            "Meaning": "CONCEPT",
            "Phonological Form": "IPA",
            "Cognate Class": "COGNATES_IELEX",
            "cc": "CONCEPT_COGNATES_IELEX"
        }, inplace=True)
        # Drop column with unused numbers
        df.drop(df.columns[[0]], axis=1, inplace=True)
    elif source == "northeuralex":
        df.rename(columns={
            "Language_ID": "DOCULECT",
            "Concept_ID": "CONCEPT"
        }, inplace=True)

    tokens = []
    if source == "ielex":
        # Perform IPA->ASJP conversion if source is ielex
        forms = []
        for form_ipa in df["IPA"]:
            # ipa_to_asjp accepts both space-separated (NELex) and
            # non-separated (IELex) input
            if input_type == "asjp":
                form_asjp = utility.ipa_to_asjp(form_ipa)
                forms.append(form_asjp)
                tokens_form = list(form_asjp)
            elif input_type == "ipa":
                tokens_form = ipa2tokens(form_ipa)
            tokens_string = " ".join(tokens_form)
            tokens.append(tokens_string)
        if input_type == "asjp":
            df["ASJP"] = forms
        df["TOKENS"] = tokens
    elif source == "northeuralex":
        if input_type == "asjp":
            for form_asjp in df["ASJP"]:
                tokens_form = list(form_asjp)
                tokens_string = " ".join(tokens_form)
                tokens.append(tokens_string)
            df["TOKENS"] = tokens
        elif input_type == "ipa":
            df["TOKENS"] = df[input_type]

    # Filter out rows with XXX phonology field
    df = df[df["IPA"] != "XXX"]
    # Filter out rows with empty phonology field
    df = df[df["IPA"] != ""]

    # Apply IELex cognate judgments to NElex
    # TODO: We can only do this if there is a publicly available intersection file
    # if source == "northeuralex":
    #     # Load intersection file
    #     df_intersection = pd.read_csv(intersection_path, sep="\t")
    #     # Per row, retrieve matching IELex judgment from intersection
    #     cognates_intersection = []
    #     for _, row in df.iterrows():
    #         cog = df_intersection[
    #             (df_intersection["iso_code"] == row["DOCULECT"]) &
    #             (df_intersection["gloss_northeuralex"] == row["CONCEPT"]) &
    #             (df_intersection["ortho_northeuralex"] == row["COUNTERPART"])
    #         ]["cog_class_ielex"]
    #         if cog.empty:
    #             cog = None
    #         else:
    #             cog = cog.iloc[0]
    #         cognates_intersection.append(cog)
    #     df["COGNATES_IELEX"] = cognates_intersection
    #     # Create CONCEPT_COGNATES_IELEX column with unique cognate classes
    #     # across concepts
    #     df["CONCEPT_COGNATES_IELEX"] = df["CONCEPT"] + "-" + df["COGNATES_IELEX"]

    print(f" - Writing corpus (with conversions) to {output_path}")
    df.to_csv(output_path, index_label="ID", sep="\t")
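# Usage sketch for load_dataset; the paths below are placeholders, and the
# `utility` module providing ipa_to_asjp is assumed to be importable from
# this project.
if __name__ == "__main__":
    load_dataset("data/ielex.tsv", source="ielex",
                 input_type="ipa", output_path="output/ielex-wordlist.tsv")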