def test_token2class():
    seq = 'tʰ ɔ x ˈth ə r A'.split(' ')
    assert token2class(seq[0], rc('dolgo')) == 'T'
    assert token2class(seq[3], 'dolgo') == 'T'
    assert token2class(seq[-1], 'dolgo') == '0'


def test_token2class(self):
    seq = 'tʰ ɔ x ˈth ə r A'.split(' ')
    assert token2class(seq[0], rc('dolgo')) == 'T'
    assert token2class(seq[3], 'dolgo') == 'T'
    assert token2class(seq[-1], 'dolgo') == '0'
    assert token2class('', 'dolgo') == '0'
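For orientation, a minimal, hedged sketch of what these tests exercise (not part of the test suite; assumes lingpy is installed): token2class maps a single token to a one-character sound class, falling back to '0' for input it cannot classify, and the model can be given by name or as a preloaded Model via rc().

# Minimal sketch (assumes lingpy is installed; not part of the test suite).
from lingpy import rc
from lingpy.sequence.sound_classes import token2class

print(token2class('tʰ', 'dolgo'))      # 'T' -- aspirated stop, Dolgopolsky class T
print(token2class('tʰ', rc('dolgo')))  # 'T' -- same, passing a preloaded Model
print(token2class('A', 'dolgo'))       # '0' -- unclassifiable tokens yield '0'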
def seg2class(segment, sca=False):
    if segment in ['#', '-']:
        return segment
    if sca:
        contexts.add((token2class(segment, 'sca'), segment))
        return token2class(segment, 'sca')
    cl = token2class(segment, 'dolgo')
    return 'vowel' if cl == 'V' else 'cons'
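A hedged sketch of how seg2class behaves, assuming the definition above is in scope; the module-level set `contexts` that the sca branch writes to is stubbed here.

# Hypothetical sketch; `contexts` stands in for the module-level set the function expects.
contexts = set()

print(seg2class('a'))             # 'vowel' -- Dolgopolsky class V
print(seg2class('p'))             # 'cons'
print(seg2class('p', sca=True))   # 'P', and ('P', 'p') is recorded in `contexts`
print(seg2class('#'))             # boundary and gap symbols pass through unchanged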
def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources."""
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(src['URITEMPLATE']) if src['URITEMPLATE'] else None
        out = [[
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT', 'GRAPHEME', 'URL'
        ] + columns]
        graphemes = set()
        for row in rows:
            if row['GRAPHEME'] in graphemes:
                args.log.warn('skipping duplicate grapheme: {0}'.format(row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            if not row['BIPA']:
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(**row) if uritemplate else row.get('URL', '')
            out.append(
                [bipa_grapheme, bipa_name, generated, explicit, row['GRAPHEME'], url]
                + [row.get(c, '') for c in columns])

        found = len([o for o in out if o[0] != '<NA>'])
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, len(out), found / len(out) * 100))
        with UnicodeWriter(
                pkg_path('transcriptiondata', '{0}.tsv'.format(src['NAME'])),
                delimiter='\t') as writer:
            writer.writerows(out)

    count = 0
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'), delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            if not sound.alias:
                writer.writerow(
                    [sound.name, grapheme]
                    + [token2class(grapheme, Model(cls)) for cls in SOUNDCLASS_SYSTEMS])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
def get_classes(alm):
    classes = []
    residue = '<div class="residue {1}">{0}</div>'
    for j, char in enumerate(alm):
        if char == '-':
            d = 'dolgo_GAP'
        else:
            d = 'dolgo_' + token2class(char, rcParams['dolgo'])

        # bad check for three classes named differently
        if d == 'dolgo__':
            d = 'dolgo_X'
        elif d == 'dolgo_1':
            d = 'dolgo_TONE'
        elif d == 'dolgo_0':
            d = 'dolgo_ERROR'
        classes += [residue.format(char, d)]
    return ''.join(classes)
for sA, dictB in sorted(all_changes.items(), key=lambda x: len(x[1]), reverse=True):
    for sB, items in dictB.items():
        table += [[
            sA, sB, len(items), len(set(items)),
            ', '.join([
                '{0} > {1} ({2})'.format(a, b, items.count((a, b)))
                for a, b in set(items)
            ])
        ]]

table = sorted(
    table,
    key=lambda x: (x[2], x[3],
                   token2class(x[0], 'cv'), token2class(x[1], 'cv'),
                   token2class(x[0], 'dolgo'), token2class(x[1], 'dolgo'),
                   token2class(x[0], 'sca'), token2class(x[1], 'sca')),
    reverse=True)
print('[i] found {0} distinct changes in the data'.format(len(table)))

with codecs.open("sound-change-frequencies.tsv", 'w', 'utf-8') as f:
    f.write('\t'.join(['Source', 'Target', 'Frequency', 'RelFreq', 'Pairs']) + '\n')
    for line in table:
        f.write('\t'.join([str(x) for x in line]) + '\n')

print('[i] most frequent 10 changes')
print(
    tabulate(
        [line[:-1] for line in table][:10],
        tablefmt='pipe',
gs = gridspec.GridSpec(len(wordlists) + 2, 1)
all_cols = []
all_sounds = defaultdict(int)
all_colors = {}
for i, w in enumerate(wordlists):
    wl = Wordlist(w)
    colors = {}
    tmp = defaultdict(int)
    sylen = []
    clen = []
    for k in wl:
        dolgos = tokens2class(wl[k, 'tokens'], 'dolgo')
        for idx, t in zip(dolgos, wl[k, 'tokens']):
            if idx not in '+':
                tmp[idx] += 1
                colors[idx] = token2class(t, color)
                all_cols += [(k, colors[idx])]
                all_sounds[idx] += 1
                all_colors[idx] = colors[idx]
        sylen += [len(syllabify(' '.join(wl[k, 'tokens']), output='nested'))]
        clen += [len([x for x in dolgos if x not in '1V'])]
    print(w, sum(sylen) / len(sylen), sum(clen) / len(clen))
    ax = plt.subplot(gs[i])
    labels = [x for x, y in sorted(tmp.items(), key=lambda x: x[0])]
    ax.pie([y for x, y in sorted(tmp.items(), key=lambda x: x[0])],
           colors=[y for x, y in sorted(colors.items(), key=lambda x: x[0])],
           radius=0.95, frame=True, shadow=True)
    ax.set_autoscale_on(False)
    plt.ylim(-1, 1)
    plt.xlim(-1, 1)
    plt.title(w.split('_')[2].split('-')[0])
color.converter["³¹"] = "Brown" color.converter["¹"] = "White" color.converter["²¹"] = "DarkOrange" color.converter["³³"] = "CornflowerBlue" color.converter["⁵³"] = "#c86496" color.converter["⁵¹"] = "cyan" _conv = {} _conv["A"] = "LightBlue" _conv["E"] = "Orange" _conv["I"] = "LightGreen" _conv["O"] = "white" _conv["U"] = "Crimson" _conv["Y"] = "LightYellow" for sound in color.converter: cls = token2class(sound, "sca") if cls in "AEIOUY": color.converter[sound] = _conv[cls] def contains(syllable, sound): _s = normalize("NFD", "".join(syllable)) if sound in _s: return True return False def is_aspirated(syllable): return contains(syllable, "ʰ")
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1): """ Function creates confidence scores for a given set of alignments. Parameters ---------- alms : :py:class`~lingpy.align.sca.Alignments` An *Alignments* object containing already aligned strings. scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict` A *ScoreDict* object which gives similarity scores for all segments in the alignment. ref : str (default="lexstatid") The reference entry-type, referring to the cognate-set to be used for the analysis. """ # store all values for average scores values = [] # store all correspondences corrs = {} # store occurrences occs = {} for key, msa in alms.msa[ref].items(): # get basic stuff idxs = msa['ID'] taxa = msa['taxa'] concept = cgi.escape(alms[idxs[0], 'concept'], True) # get numerical representation of alignments if scorer: alignment = [class2tokens( alms[idxs[i], 'numbers'], msa['alignment'][i]) for i in range(len(idxs))] else: alignment = msa['alignment'] # create new array for confidence confidence_matrix = [] character_matrix = [] # iterate over each taxon for i, taxon in enumerate(taxa): idx = alms.taxa.index(taxon) + 1 # get the numerical sequence nums = alignment[i] # store confidences per line confidences = [] # store chars per line chars = [] # iterate over the sequence for j, num in enumerate(nums): col = [alm[j] for alm in alignment] score = 0 count = 0 # get the char if num != '-': charA = dotjoin(taxa[i], msa['alignment'][i][j], num.split('.')[2]) chars += [charA] try: occs[charA] += [concept] except: occs[charA] = [concept] else: chars += ['-'] for k, numB in enumerate(col): if k != i: if num == '-' and numB == '-': pass else: if numB != '-' and num != '-': # get the second char charB = dotjoin( taxa[k], msa['alignment'][k][j], numB.split('.')[2]) try: corrs[charA][charB] += 1 except: try: corrs[charA][charB] = 1 except: corrs[charA] = {charB: 1} gaps = False if num == '-' and numB != '-': numA = charstring(idx) gaps = True elif numB == '-' and num != '-': numB = charstring(alms.taxa.index(taxa[k])) numA = num gaps = True else: numA = num scoreA = scorer[numA, numB] scoreB = scorer[numB, numA] this_score = max(scoreA, scoreB) if not gaps: score += this_score count += 1 else: score += this_score * gap_weight count += gap_weight if count: score = score / count else: score = -25 confidences += [int(score + 0.5)] values += [int(score + 0.5)] confidence_matrix += [confidences] character_matrix += [chars] # append confidence matrix to alignments alms.msa[ref][key]['confidence'] = confidence_matrix alms.msa[ref][key]['_charmat'] = character_matrix # sort the values values = sorted(set(values + [1])) # make conversion to scale of 100 values converter = {} valsA = values[:values.index(1)] valsB = values[values.index(1):] stepA = 50 / (len(valsA) + 1) stepB = 75 / (len(valsB) + 1) for i, score in enumerate(valsA): # values[:values.index(0)): converter[score] = int((stepA * i) / 4 + 0.5) for i, score in enumerate(valsB): converter[score] = int(stepB * i + 0.5) + 50 # iterate over keys again for key, msa in alms.msa[ref].items(): # get basic stuff for i, line in enumerate(msa['confidence']): for j, cell in enumerate(line): alms.msa[ref][key]['confidence'][i][j] = converter[cell] jsond = {} for key, corr in corrs.items(): splits = [c.split('.') + [o] for c, o in corr.items()] sorts = sorted(splits, key=lambda x: (x[0], -x[3])) new_sorts = [] # check for rowspan spans = {} for a, b, c, d in sorts: if a in spans: if spans[a] < 3 and d > 1: spans[a] += 1 new_sorts += [[a, b, c, d]] 
else: if d > 1: spans[a] = 1 new_sorts += [[a, b, c, d]] bestis = [] old_lang = '' counter = 0 for a, b, c, d in new_sorts: new_lang = a if new_lang != old_lang: old_lang = new_lang tmp = '<tr class="display">' tmp += '<td class="display" rowspan={0}>'.format(spans[a]) tmp += a + '</td>' tmp += '<td class="display" onclick="show({0});"><span '.format( "'" + dotjoin(a, b, c) + "'") tmp += 'class="char {0}">' + b + '</span></td>' tmp += '<td class="display">' tmp += c + '</td>' tmp += '<td class="display">' + str(d) + '</td>' tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>' tmp += '</tr>' t = 'dolgo_' + token2class(b, rcParams['dolgo']) # bad check for three classes named differently if t == 'dolgo__': t = 'dolgo_X' elif t == 'dolgo_1': t = 'dolgo_TONE' elif t == 'dolgo_0': t = 'dolgo_ERROR' bestis += [tmp.format(t)] counter += 1 elif counter > 0: tmp = '<tr class="display">' tmp += '<td class="display" onclick="show({0});"><span '.format( "'" + dotjoin(a, b, c) + "'") tmp += 'class="char {0}">' + b + '</span></td>' tmp += '<td class="display">' + c + '</td>' tmp += '<td class="display">' + str(d) + '</td>' tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>' tmp += '</tr>' t = 'dolgo_' + token2class(b, rcParams['dolgo']) # bad check for three classes named differently if t == 'dolgo__': t = 'dolgo_X' elif t == 'dolgo_1': t = 'dolgo_TONE' elif t == 'dolgo_0': t = 'dolgo_ERROR' bestis += [tmp.format(t)] counter += 1 old_lang = new_lang else: old_lang = new_lang counter = 0 jsond[key] = [''.join(bestis), occs[key]] return jsond
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load msa
    msa = read_msa(infile)

    ## load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            elif char == '-':
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
def simple_profile(wordlist, ref='ipa', semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                   merge_vowels=False, brackets=None, splitters='/,;~',
                   merge_geminates=True, bad_word="<???>", bad_sound="<?>",
                   clts=None, unknown_sound="!{0}"):
    """
    Create an initial Orthography Profile using Lingpy's clean_string procedure.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    clts : dict (default=None)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can
        add formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class
        should be handled. Note that both "bad_word" and "bad_sound" are
        format-strings, so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If with_clts is set to True, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.

    Returns
    -------
    profile : generator
        A generator of tuples (four items), indicating the segment, its
        suggested conversion (based on the Dolgopolsky sound-class model and
        the optional CLTS mapping), its frequency, and its unicode-codepoints.
    """
    clts = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(int)
    words = [wordlist[idx, ref] for idx in wordlist]
    for word in pb(words, desc='iterating over words'):
        if isinstance(word, list):
            word = ' '.join(word)
        cleaned_string = clean_string(
            word,
            semi_diacritics=semi_diacritics,
            merge_vowels=merge_vowels,
            brackets=None,
            ignore_brackets=False,
            split_entries=False,
            preparse=None,
            rules=None,
            merge_geminates=merge_geminates)[0]

        # retain whole word if there are splitters in the word
        if [x for x in cleaned_string if x in brackets + splitters]:
            profile[word] += 1
            bad_words.add(word)
        else:
            for segment in cleaned_string.split(' '):
                profile[segment] += 1
            for segment in [x for x in word if x not in cleaned_string]:
                profile[segment] += 1
                nulls.add(segment)

    for s, f in pb(sorted(profile.items(), key=lambda x: x[1], reverse=True),
                   desc='preparing profile'):
        sclass = token2class(s, 'dolgo')
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0' and s not in nulls:
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts:
            sound = clts.get(s, False)
            if not sound:
                ipa = '!' + s
            else:
                ipa = text_type(sound)
        else:
            ipa = s
        yield s, ipa, text_type(f), codepoint(s)
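A hedged sketch of how the generator above might be consumed; the input file name is hypothetical, and the output columns follow the four-tuple of strings it yields.

# Hypothetical usage sketch: write an initial orthography profile to TSV.
from lingpy import Wordlist

wl = Wordlist('wordlist.tsv')  # hypothetical wordlist with an "ipa" column
with open('orthography-profile.tsv', 'w', encoding='utf-8') as f:
    f.write('Grapheme\tIPA\tFrequency\tCodepoints\n')
    for grapheme, ipa, frequency, codepoints in simple_profile(wl, ref='ipa'):
        f.write('\t'.join([grapheme, ipa, frequency, codepoints]) + '\n')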
for msa, vals in alm.msa[target].items():
    langs = vals['taxa']
    seqs = vals['alignment']
    alm_len = len(seqs[0])
    len_alms += alm_len
    # print(alm_len)
    for i, lang in enumerate(alm.cols):
        raxml_alm_str = ""
        if lang not in langs:
            alm_str = alm_len * '?'
            raxml_alm_str = list(alm_str)
        else:
            raxml_alm_str = [
                token2class(x, 'sca') if x != '-' else '-'
                for x in seqs[langs.index(lang)]
            ]
            for ch in raxml_alm_str:
                if ch == '-':
                    continue
                if ch not in uniq_chars:
                    uniq_chars.append(ch)
        # raxml_alm_str = ' '.join([map_chars[uniq_chars.index(x)] if x != '-' else '-' for x in alm_str])
        # print(raxml_alm_str)
        phylip[lang] += raxml_alm_str
        # phylip[lang] += alm_str

print(len(uniq_chars), " ALPHABET")
print(sorted(uniq_chars))
def alm2html( infile, title='', shorttitle='', filename='', colored=False, main_template='', table_template='', dataset='', confidence=False, **keywords ): """ Convert files in ``alm``-format into colored ``html``-format. Parameters ---------- title : str Define the title of the output file. If no title is provided, the default title ``LexStat - Automatic Cognate Judgments`` will be used. shorttitle : str Define the shorttitle of the ``html``-page. If no title is provided, the default title ``LexStat`` will be used. Notes ----- The coloring of sound segments with respect to the sound class they belong to is based on the definitions given in the ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted. See also -------- lingpy.convert.html.msa2html lingpy.convert.html.msa2tex """ util.setdefaults(keywords, json="", labels={}) # open the infile if not os.path.exists(infile): infile = infile + '.alm' data = util.read_text_file(infile) # create the outfile if not filename: filename = rcParams['filename'] # read in the templates html = util.read_text_file(main_template or template_path('alm2html.html')) if not table_template: table_template = template_path( 'alm2html.table.js.html' if confidence else 'alm2html.table.html') table = util.read_text_file(table_template) css = util.read_text_file(template_path('alm.css')) js = util.read_text_file(template_path('alm.js')) # define a label function for the taxa label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x # check for windows-compatibility data = data.replace(os.linesep, '\n')[:-1] # split the data into blocks blocks = data.split('\n\n') # retrieve the dataset dataset = dataset or blocks[0] # create the outstring tmp_str = '' for block in blocks[1:]: lines = block.split('\n') m = [l.split('\t') for l in lines] # create colordict for different colors dc = len(set([l[0] for l in m])) if colored: colors = {a: b for a, b in zip( sorted(set([int(l[0]) for l in m])), colorRange(dc, brightness=400), )} else: colors = [] white = True for i in sorted(set([abs(int(l[0])) for l in m])): if white: colors.append((i, 'white')) white = False else: colors.append((i, 'gray')) white = True colors = dict(colors) # get the basic item and its id iName = m[0][2] iID = m[0][3] # start writing the stuff to string tmp_str += table.format(NAME=iName, ID=iID) # define the basic string for the insertion bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}' for tracer, l in enumerate(m): # check whether the current line is a borrowing if int(l[0]) < 0: loan_line = ' loan' else: loan_line = '' # assign the cognate id tmp = ' <td>{0}</td>\n'.format(l[0]) tmp += ' <td>{0}</td>\n'.format(label(l[1].strip('.'))) # check alignments for confidence scores ipa_string = ''.join([cell.split('/')[0] for cell in l[4:]]).replace('-', '') tmp += ' <td>{0}</td>\n'.format(ipa_string) tmp += ' <td class="{0}">\n'.format(colors[abs(int(l[0]))]) tmp += ' <table class="{0}">\n'.format(colors[abs(int(l[0]))]) tmp += ' <tr>\n{0} </tr>\n </table>\n </td>\n </tr>\n' # check whether another entry follows that is also an alignment, # otherwise, there's no need to display a word as an alignment cognate_set = False if tracer < len(m) - 1: if abs(int(m[tracer + 1][0])) == abs(int(l[0])): cognate_set = True if tracer > 0: if abs(int(m[tracer - 1][0])) == abs(int(l[0])): cognate_set = True # fill out html for the cognate sets if cognate_set: alm = '' for char in l[4:]: # check for confidence scores if '/' in char: try: char, conf, num = char.split('/') conf 
= int(conf) except ValueError: print(char.split('/')) raise ValueError("Something is wrong with %s." % (char)) else: char, conf, rgb = char, (255, 255, 255), 0.0 if char == '-': d = 'dolgo_GAP' else: d = 'dolgo_' + token2class(char, rcParams['dolgo']) # bad check for three classes named differently if d == 'dolgo__': d = 'dolgo_X' elif d == 'dolgo_1': d = 'dolgo_TONE' elif d == 'dolgo_0': d = 'dolgo_ERROR' if confidence: alm += ' ' alm += '<td class="char {1}" confidence={0} '.format( conf, d ) alm += 'char="{0}" '.format(char) alm += 'onclick="' + "show('{0}')".format(num) + '" ' alm += 'num="{0}"'.format(num) alm += '>\n {0}\n </td>\n'.format(char) else: alm += ' ' alm += '<td class="char {0}">{1}</td>\n'.format(d, char) else: alm = ' ' alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(l[0]))]) # format the alignment try: tmp = tmp.format(alm) except ValueError: raise ValueError("Unknown problem in matchin %s and %s." % (alm, tmp)) # check for last line, where a new line should be inserted (not the # fastest solution, but plotting is not a matter of time, and it # suffices it's current purpose if tracer < len(m) - 1: pass else: if confidence: tmp += ' </table>\n' tmp += ' <tr class="empty"><td colspan="4" class="empty">' tmp += '<hr class="empty" /></td></tr>\n' # format the whole string tmp_str += bas.format( colors[abs(int(l[0]))], tmp, loan_line, l[1] ) if not title: title = "LexStat - Automatic Cognate Judgments" if not shorttitle: shorttitle = "LexStat" # check for json-attribute if keywords['json']: keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'], indent=1) html = html.format( shorttitle=shorttitle, title=title, table=tmp_str, dataset=dataset, javascript=js, css=css, **keywords ) util.write_text_file(filename + '.html', html) return
def msa2html( msa, shorttitle='', filename='', template='', **keywords ): """ Convert files in ``msa``-format into colored ``html``-format. Parameters ---------- msa : dict A dictionary object that contains all the information of an MSA object. shorttitle : str Define the shorttitle of the ``html``-page. If no title is provided, the default title ``SCA`` will be used. filename : str (default="") Define the name of the output file. If no name is defined, the name of the input file will be taken as a default. template : str (default="") The path to the template file. If no name is defined, the basic template will be used. The basic template currently used can be found under ``lingpy/data/templates/msa2html.html``. Examples -------- Load the libary. >>> from lingpy import * Load an ``msq``-file from the test-sets. >>> msa = MSA('harry.msq') Align the data progressively and carry out a check for swapped sites. >>> msa.prog_align() >>> msa.swap_check() >>> print(msa) w o l - d e m o r t w a l - d e m a r - v - l a d i m i r - Save the data to the file ``harry.msa``. >>> msa.output('msa',filename='harry') Save the ``msa``-object as ``html``. >>> msa.output('html',filename='harry') Notes ----- The coloring of sound segments with respect to the sound class they belong to is based on the definitions given in the ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted. See also -------- lingpy.convert.html.alm2html """ util.setdefaults( keywords, pid_mode=1, stress=rcParams['stress'], css=False, js=False, compact=False, class_sort=True, write_to_file=True, ) # while alm-format can be read from the text-file without problems, # msa-format should be loaded first (once this is already provided), the # loss in speed won't matter much, since output of data is not a daily task # load templates template = template or template_path('msa2html.html') if template == 'js': template = template_path('msa2html.js.html') html = util.read_text_file(template) css = util.read_text_file(keywords['css'] or template_path('msa.css')) js = util.read_text_file(keywords['js'] or template_path('msa.js')) # treat the msa-object as a file and try to load the file if this is the # case if isinstance(msa, string_types): msa = read_msa(msa, **keywords) else: raise ValueError('[!] No filename specified.') # load dataset, etc. dataset = msa['dataset'] # calculate pid score, if it is not passed as argument if 'pid_score' not in keywords: pid_score = 0 count = 0 for i, seqA in enumerate(msa['alignment']): for j, seqB in enumerate(msa['alignment']): if i < j: pid_score += pid(seqA, seqB, mode=keywords['pid_mode']) count += 1 pid_score = int(100 * pid_score / count + 0.5) else: pid_score = keywords['pid_score'] infile = msa['infile'] seq_id = msa['seq_id'] # define the titles etc. 
if not shorttitle: shorttitle = 'SCA' # determine the length of the longest taxon taxl = max([len(t) for t in msa['taxa']]) # format css file css = css.replace('TAXON_LENGTH', str(taxl * 10)) out = '' tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n' td_taxon = '<td class="taxon">{0}</td>' perc = int(80 / len(msa['alignment'][0]) + 0.5) td_residue = '<td class="residue {1}">{0}</td>' td_swap = '<td class="residue swap {1}">{0}</td>' td_unaligned = '<td class="residue noalign {1}">{0}</td>' # check for swaps in the alignment if 'swaps' in msa: swaps = [] for s in msa['swaps']: swaps.extend(s) else: swaps = [] # check for local = ['*'] * len(msa['alignment'][0]) if 'local' in msa: local = ['.'] * len(msa['alignment'][0]) for i in msa['local']: local[i] = '*' # get two sorting schemas for the sequences if keywords['class_sort']: classes = [tokens2class(ipa2tokens(seq), rcParams['asjp']) for seq in msa['seqs']] seqs = dict( [(a[1], b) for a, b in zip( sorted( zip(classes, msa['seqs']), key=lambda x: x[0] # list(zip(x[0],x[1])) ), range(1, len(msa['seqs']) + 1) )] ) else: seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1))) taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1))) # set up a list to store unique alignments alignments = [] # start iteration for i, taxon in enumerate(msa['taxa']): tmp = '' tmp += td_taxon.format(taxon) # append alignment to alignments alignment = ''.join(msa['alignment'][i]) sequence = msa['seqs'][i] if alignment in alignments: unique = 'false' else: unique = 'true' alignments += [alignment] for j, char in enumerate(msa['alignment'][i]): if char == '-': d = 'dolgo_GAP' c = '#bbbbbb' else: d = 'dolgo_' + token2class(char, rcParams['dolgo']) c = token2class(char, rcParams['_color']) # bad check for three classes named differently if d == 'dolgo__': d = 'dolgo_X' elif d == 'dolgo_1': d = 'dolgo_TONE' elif d == 'dolgo_0': d = 'dolgo_ERROR' if j in swaps: tmp += td_swap.format(char, d) elif local[j] != '*': tmp += td_unaligned.format(char, d) else: tmp += td_residue.format(char, d) out += tr.format(tmp, unique, taxa[taxon], seqs[sequence]) html = html.format( table=out, dataset=dataset, pid=pid_score, file=infile, sequence=seq_id, shorttitle=shorttitle, width=len(msa['alignment'][0]), table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl), taxa=len(msa['alignment']), uniseqs=len(set(msa['seqs'])), css=css, js=js ) if not filename: filename = rcParams['filename'] if not filename.endswith('.html'): filename = filename + '.html' if keywords['compact']: html = html.replace('\n', ' ') html = re.sub(r'\s+', r' ', html) html = html.replace('> ', '>') html = html.replace(' >', '>') if keywords['write_to_file']: # check, whether the outfile already exists util.write_text_file(filename, html) else: return html
def run(args):
    ds = Dataset()
    alms = Alignments(
        ds.dir.joinpath('workflow', 'D_Chen_aligned.tsv').as_posix(),
        ref='cogids', transcription='form')
    sounds = defaultdict(lambda: defaultdict(int))
    for cogid, msa in alms.msa['cogids'].items():
        for (i, tA), (j, tB) in combinations(enumerate(msa['taxa']), r=2):
            for soundA, soundB in zip(msa['alignment'][i], msa['alignment'][j]):
                soundA = soundA.split('/')[1] if '/' in soundA else soundA
                soundB = soundB.split('/')[1] if '/' in soundB else soundB
                sounds[soundA][soundB] += 1
                sounds[soundB][soundA] += 1

    #args.log.info('found {0} sounds in data'.format(len(sounds)))
    soundlist = [
        s for s in sorted(
            sounds,
            key=lambda x: (token2class(x, 'cv', cldf=True),
                           token2class(x, 'dolgo', cldf=True),
                           token2class(x, 'sca', cldf=True),
                           token2class(x, 'asjp')),
            reverse=True)
        if token2class(s, 'cv', cldf=True) in 'T'
    ]  # ['K', 'G', 'C', 'D', 'T']

    matrix = [[0 for x in soundlist] for y in soundlist]

    # iterate over sounds and try to bin the values
    for i, soundA in enumerate(soundlist):
        targets = sounds[soundA]
        soundsB = [
            s for s in sorted(targets.items(), key=lambda x: x[1], reverse=True)
            if s[0] in soundlist
        ]
        total = sum([targets[x[0]] for x in soundsB])
        bins = [(a, int(round(b / total * 100, 0))) for a, b in soundsB]
        print(total, soundA, sum([x[1] for x in bins]), bins)
        for soundB, score in bins:
            j = soundlist.index(soundB)
            if i < j:
                matrix[i][j] = score

    # iterate over sounds and try to bin the values
    for i, soundA in enumerate(soundlist):
        targets = sounds[soundA]
        soundsB = [
            s for s in sorted(targets.items(), key=lambda x: x[1], reverse=True)
            if s[0] in soundlist
        ]
        total = sum([targets[x[0]] for x in soundsB])
        print(total, soundA, soundsB)
        bins = [(a, int(round(b / total * 100, 0))) for a, b in soundsB]
        for soundB, score in bins:
            j = soundlist.index(soundB)
            if i >= j:
                matrix[i][j] = score

    args.log.info('calculated the matrix')
    plt.imshow(matrix, cmap='jet', vmax=100)
    plt.title('Sound correspondence frequency across Hmong-Mien languages')
    cb = plt.colorbar()
    cb.set_label('Frequency')
    plt.xticks(range(0, len(soundlist)), soundlist, fontsize=3)
    plt.yticks(range(0, len(soundlist)), soundlist, fontsize=3)
    plt.savefig(ds.dir.joinpath('workflow', 'plots.pdf').as_posix())
def context_profile(wordlist, ref='ipa', col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw', merge_vowels=False,
                    brackets=None, splitters='/,;~', merge_geminates=True,
                    clts=False, bad_word="<???>", bad_sound="<?>",
                    unknown_sound="!{0}", examples=2, max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    clts : dict (default=None)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can
        add formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class
        should be handled. Note that both "bad_word" and "bad_sound" are
        format-strings, so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If with_clts is set to True, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.
    max_entries : int (default=100)
        The maximal number of entries per segment that are considered when
        computing frequencies, example words, and languages.

    Returns
    -------
    profile : generator
        A generator of tuples (six items), indicating the segment (with '^'
        and '$' context markers), its suggested conversion, example words,
        the languages in which it occurs, its frequency, and its
        unicode-codepoints.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(
            wordlist.iter_rows(ref, col), desc='iter words', total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        if word.strip():
            try:
                cleaned_string = clean_string(
                    word,
                    semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels,
                    brackets=None,
                    ignore_brackets=False,
                    split_entries=False,
                    preparse=None,
                    rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain whole word if there are splitters in the word
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(
                            context_pre, context_post, cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    for segment in [
                            x for x in word if x not in ' '.join(cleaned_string)]:
                        profile[segment] += [(language, word)]
                        nulls.add(segment)
            except:
                errors.add(idx)
                log.warn('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''
    for idx, (s, entries) in pb(
            enumerate(sorted(profile.items(), key=lambda x: len(x[1]), reverse=True)),
            desc='yielding entries', total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words = [l[1] for l in entries][:max_entries]
        langs = [l[0] for l in entries][:max_entries]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x), reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                ipa = '!' + s.strip('^$')
            else:
                ipa = text_type(sound)
        else:
            ipa = s.strip('^$')
        yield s, ipa, examples_, languages, frequency, codepoints