def check_strict_cognates(wordlist, ref='crossids', segments='tokens'):
    """Check if cognates are really strict.

    Parameters
    ----------
    wordlist : Wordlist
        The wordlist to check; must provide ``get_etymdict`` and
        tuple-key item access ``wordlist[idx, column]``.
    ref : str (default='crossids')
        Column holding the (partial) cognate identifiers.
    segments : str (default='tokens')
        Column holding the segmented forms; the value is expected to
        expose a ``.n`` attribute with one morpheme per cognate slot.

    Returns
    -------
    fails : list
        Row indices (or whole index lists for malformed cognate ids)
        whose aligned morphemes do not match the first member of their
        cognate set.
    """
    fails, errors = [], 0
    etd = wordlist.get_etymdict(ref=ref)
    for cogid in etd:
        for vals in etd[cogid]:
            if vals:
                # non-numeric cognate ids are malformed: reject all rows
                if not str(cogid).isdigit():
                    fails += vals
                else:
                    alms = []
                    for idx in vals:
                        try:
                            tokens = wordlist[idx, segments].n[
                                wordlist[idx, ref].index(cogid)]
                            alms += [(idx, tokens)]
                        # narrowed from a bare except: only lookup/shape
                        # problems in the row data are expected here
                        except (KeyError, IndexError, ValueError,
                                AttributeError):
                            fails += [idx]
                    # all members must share the morpheme of the first one
                    for idx, tokens in alms[1:]:
                        if str(tokens) != str(alms[0][1]):
                            fails += [idx]
                            errors += 1
                            log.warning('{0} | {1} | {2} | {3:15} | {4:15}'.format(
                                errors, idx, alms[0][0], str(tokens),
                                str(alms[0][1])))
    return fails
def check_strict_cognates(wordlist, ref='crossids', segments='tokens'):
    """Check if cognates are really strict.

    Returns the indices of rows whose morpheme does not match the first
    member of their cognate set (or whose cognate id is malformed).
    """
    fails = []
    error_count = 0
    etd = wordlist.get_etymdict(ref=ref)
    for cogid, rows in etd.items():
        for indices in rows:
            if not indices:
                continue
            # malformed (non-numeric) cognate ids fail wholesale
            if not str(cogid).isdigit():
                fails.extend(indices)
                continue
            alignments = []
            for idx in indices:
                try:
                    slot = wordlist[idx, ref].index(cogid)
                    alignments.append((idx, wordlist[idx, segments].n[slot]))
                except BaseException:
                    fails.append(idx)
            if not alignments:
                continue
            ref_idx, ref_tokens = alignments[0]
            for idx, tokens in alignments[1:]:
                if str(tokens) != str(ref_tokens):
                    fails.append(idx)
                    error_count += 1
                    log.warning('{0} | {1} | {2} | {3:15} | {4:15}'.format(
                        error_count, idx, ref_idx, str(tokens),
                        str(ref_tokens)))
    return fails
def check_sequence_length(
        wordlist,
        entities=['tokens', 'crossids', 'morphemes', 'structure'],
        dimensions=[2, 1, 2, 1]):
    """Function checks for identical sequence length in different columns.

    Every pair of columns from ``entities`` is compared row by row; rows
    where the lengths disagree are reported as ``(idx, columnA, columnB)``.
    """
    fails = []
    error_count = 0
    column_pairs = combinations(zip(entities, dimensions), r=2)
    for (ent_a, dim_a), (ent_b, dim_b) in column_pairs:
        for idx in wordlist:
            seq_a = wordlist[idx, ent_a]
            seq_b = wordlist[idx, ent_b]
            if check_length(seq_a, seq_b, dim_a, dim_b):
                continue
            error_count += 1
            log.warning('{0} | {1} | {2} | {3} | {4} | {5}'.format(
                error_count, idx, ent_a, ent_b, seq_a, seq_b))
            fails.append((idx, ent_a, ent_b))
    return fails
def test_convenience():
    """Smoke-test the convenience logging wrappers."""
    # simple one-argument wrappers, exercised in the original order
    for wrapper in (info, warning, debug, error):
        wrapper('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
def check_cognates(wordlist, ref='crossids'):
    """Function checks for internal consistency of partial cognates.

    A row fails when its list of partial cognate ids contains duplicates.
    """
    fails = []
    for idx, cogids in wordlist.iter_rows(ref):
        has_duplicates = len(set(cogids)) != len(cogids)
        if has_duplicates:
            log.warning('duplicates in {0}'.format(cogids))
            fails.append(idx)
    return fails
def _get_brackets(brackets): out = defaultdict(str) for b in brackets: out[b] = unicodedata.lookup(unicodedata.name(b).replace('LEFT', 'RIGHT')) if b == out[b]: log.warning('lingpy.sequence.sound_classes.get_brackets' + \ 'Item «{0}» does not have a counterpart!'.format(b)) return out
def string2html(taxon, string, swaps=[], tax_len=None):
    """
    Function converts an (aligned) string into colored html-format.

    Parameters
    ----------
    taxon : str
        Taxon name, placed in the leading table cell.
    string : iterable
        The (aligned) sequence; each segment becomes one colored cell.
    swaps : list (default=[])
        Positions rendered with the "swap" cell style.
    tax_len : int (default=None)
        Width factor for the taxon cell; defaults to ``len(taxon)``.

    @deprecated
    """
    # determine the length of the string
    if not tax_len:
        tax_len = len(taxon)

    # set the td_taxon-line
    td_taxon = '<td class="taxon" width="' + str(15 * tax_len) + '">{0}</td>\n'

    # templates for regular and swapped residues
    # (unused locals `tr` and `perc` removed; `perc` even raised
    # ZeroDivisionError for an empty `string`)
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
        '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
        'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = ''
    out += td_taxon.format(taxon)

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except KeyError:  # narrowed from bare except: only missing colors expected
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warning("Unknown character '" + char + "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'
        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)
    return out
def string2html(taxon, string, swaps=[], tax_len=None):
    """
    Function converts an (aligned) string into colored html-format.

    @deprecated
    """
    # width factor for the taxon cell
    width = tax_len if tax_len else len(taxon)

    # row template (computed but unused, kept for behavioral fidelity)
    tr = '<tr class="msa">\n{0}\n</tr>'

    # cell template for the taxon name
    td_taxon = '<td class="taxon" width="' + str(15 * width) + '">{0}</td>\n'

    # percentage scaling factor (computed but unused, kept for fidelity)
    perc = int(80 / len(string) + 0.5)

    # cell templates for regular and swapped residues
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
        '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
        'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    chunks = [td_taxon.format(taxon)]

    for pos, segment in enumerate(string):
        try:
            bg = rcParams['_color'][segment]
            fg = '#000000'
        except:
            try:
                bg = rcParams['_color'][segment[0]]
                fg = '#000000'
            except KeyError:
                log.warning("Unknown character '" + segment + "', press ANY key to continue. ")
                bg = '#ffffff'
                fg = '#eb3410'
        template = td_swap if pos in swaps else td_residue
        chunks.append(template.format(segment, bg, fg))

    return ''.join(chunks)
def tokens2html(
        string,
        swaps=[],
        tax_len=None,
):
    """
    Function converts an (aligned) string into colored html-format.

    Notes
    -----
    This function is currently not used by any other program. So it might be
    useful to just deprecate it.

    @deprecated
    """
    # row template (computed but unused, kept for behavioral fidelity)
    tr = '<tr class="msa">\n{0}\n</tr>'

    # percentage scaling factor (computed but unused, kept for fidelity)
    perc = int(80 / len(string) + 0.5)

    # cell templates for regular and swapped residues
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
        '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
        'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    cells = ['<table>']
    for pos, segment in enumerate(string):
        try:
            bg = rcParams['_color'][segment]
            fg = '#000000'
        except:
            try:
                bg = rcParams['_color'][segment[0]]
                fg = '#000000'
            except KeyError:
                log.warning("Unknown character '" + segment + "', press ANY key to continue. ")
                bg = '#ffffff'
                fg = '#eb3410'
        template = td_swap if pos in swaps else td_residue
        cells.append(template.format(segment, bg, fg))

    return ''.join(cells) + '</table>'
def check_sequence_length(
        wordlist,
        entities=['tokens', 'crossids', 'morphemes', 'structure'],
        dimensions=[2, 1, 2, 1]):
    """Function checks for identical sequence length in different columns.

    Note: this variant reports only the failing row index, not the column
    pair.
    """
    fails = []
    error_count = 0
    for (ent_a, dim_a), (ent_b, dim_b) in combinations(
            zip(entities, dimensions), r=2):
        for idx in wordlist:
            seq_a = wordlist[idx, ent_a]
            seq_b = wordlist[idx, ent_b]
            if check_length(seq_a, seq_b, dim_a, dim_b):
                continue
            error_count += 1
            log.warning('{0} | {1} | {2} | {3} | {4} | {5}'.format(
                error_count, idx, ent_a, ent_b, seq_a, seq_b))
            fails.append(idx)
    return fails
def npoint_ap(scores, cognates, reverse=False): """ Calculate the n-point average precision. Parameters ---------- scores : list The scores of your algorithm for pairwise string comparison. cognates : list The cognate codings of the word pairs you compared. 1 indicates that the pair is cognate, 0 indicates that it is not cognate. reverse : bool (default=False) The order of your ranking mechanism. If your algorithm yields high scores for words which are probably cognate, and low scores for non-cognate words, you should set this keyword to "True". Notes ----- This follows the description in :evobib:`Kondrak2002`. The n-point average precision is useful to compare the discriminative force of different algorithms for string similarity, or to train the parameters of a given algorithm. Examples -------- >>> scores = [1, 2, 3, 4, 5] >>> cognates = [1, 1, 1, 0, 0] >>> from lingpy.evaluate.acd import npoint_ap >>> npoint_ap(scores, cognates) 1.0 """ p = 0.0 cognate_count = 0 for k, (score, cognate) in enumerate( sorted(zip(scores, cognates), key=lambda x: x[0], reverse=reverse)): if cognate == 1: cognate_count += 1 p += cognate_count / (k + 1.0) try: return p / cognates.count(1) except ZeroDivisionError: log.warning( "Encountered Zero Division in npoint_ap, your data seems to contain no cognates." ) return 0
def wl2multistate(wordlist, ref, missing): """ Function converts a wordlist to multistate format (compatible with PAUP). """ # convert the data to a multistate matrix # get etymological dictionary wordlist.get_etymdict(ref=ref) # define chars, we only have a limited set, unfortunately chars = ascii_letters + digits # iterate over all cognate sets and assign the chars matrix = [] for c in wordlist.concepts: taxon_to_cognate_set = wordlist.get_dict(concept=c, entry=ref) distinct_states = set() for taxon in wordlist.taxa: distinct_states.update(taxon_to_cognate_set.get(taxon, [0])) # make converter if len(distinct_states) > len(chars): # pragma: no cover # FIXME: This shouldn't just be a warning, because we # will get a KeyError # down below, since zip just returns a list of length len(chars)! log.warning('more distinct states than available characters!') char_map = dict(zip(sorted(distinct_states), chars)) char_map['-'] = '-' line = [] for taxon in wordlist.taxa: states = set(taxon_to_cognate_set.get(taxon, ['-'])) # exclude the case len(taxon_to_cognate_set[taxon]) == 0 if len(states) == 1: line.append(char_map[states.pop()]) elif not states: line.append(missing) else: line.append('({0})'.format( "".join([char_map[x] for x in sorted(states)]))) matrix.append(line) return misc.transpose(matrix)
def wl2multistate(wordlist, ref, missing):
    """
    Function converts a wordlist to multistate format (compatible with PAUP).
    """
    # side effect only: make sure the etymological dictionary is computed
    wordlist.get_etymdict(ref=ref)

    # the available state symbols are limited to ASCII letters and digits
    chars = ascii_letters + digits

    matrix = []
    for concept in wordlist.concepts:
        taxon_to_cognate_set = wordlist.get_dict(concept=concept, entry=ref)

        # gather every cognate state attested for this concept
        distinct_states = set()
        for taxon in wordlist.taxa:
            distinct_states.update(taxon_to_cognate_set.get(taxon, [0]))

        # build the state-to-symbol converter
        if len(distinct_states) > len(chars):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we will get a
            # KeyError below, since zip truncates to len(chars) entries!
            log.warning('more distinct states than available characters!')
        char_map = dict(zip(sorted(distinct_states), chars))
        char_map['-'] = '-'

        row = []
        for taxon in wordlist.taxa:
            states = set(taxon_to_cognate_set.get(taxon, ['-']))
            if not states:
                # taxon has an entry list of length zero
                row.append(missing)
            elif len(states) == 1:
                row.append(char_map[states.pop()])
            else:
                # polymorphic cell: all states inside parentheses
                symbols = "".join([char_map[s] for s in sorted(states)])
                row.append('({0})'.format(symbols))
        matrix.append(row)

    return misc.transpose(matrix)
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.

    Parameters
    ----------
    infile : str
        Path to the PSA input file.

    Notes
    -----
    Recognized keyword arguments: ``template`` / ``css`` (alternative
    HTML/CSS template paths, default: bundled ``psa.html`` / ``psa.css``),
    ``comment`` (comment character, default ``#``), ``filename`` (output
    path, default: input name with ``.html`` suffix) and ``compact``
    (collapse whitespace in the generated markup).
    """
    # fill in defaults for all recognized keyword arguments
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4]+'.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    # read the input, dropping comment lines
    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    # drop the header line of the PSA file
    del data[0]

    # each alignment occupies 4 lines: id, row A, row B, separator
    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            # NOTE(review): `pairs` is filled but never used below
            pairs.append(
                (
                    '.'.join([k for k in almA if k != '-']),
                    '.'.join([k for k in almB if k != '-'])
                ))
            alignments.append(
                (
                    [str(a) for a in almA],
                    [str(b) for b in almB],
                    0))
            # both rows of one alignment must have the same length
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            # malformed record: skip a single line and resynchronize
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        # render each aligned segment as a <div> css-classed by its
        # Dolgopolsky sound class
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

            # bad check for three classes named differently
            if d == 'dolgo__':
                d = 'dolgo_X'
            elif d == 'dolgo_1':
                d = 'dolgo_TONE'
            elif d == 'dolgo_0':
                d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        # rounded percentage identity of the aligned pair
        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids
        )
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'
    out += '</table>'
    html = template.format(alignments=out, css=css)

    if kw['compact']:
        # collapse all whitespace runs in the final markup
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will
        be ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            # simple meta-data statement of the form "@key: value"
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    # repeated keys are collected into a list
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords: tag attributes of the form
            # key="value" follow the tag name, separated by spaces
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {k: v[1:-1] for k, v in
                        [key.split('=') for key in tmp.split(' ')[1:]]}
            else:
                dtype = tmp.strip()
                keys = {}

            # collect all lines up to the matching closing tag
            tmp = []
            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]
            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}
                if not keys:
                    keys["id"] = "1"
                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    # NOTE(review): eval on a file-supplied type name is a
                    # code-execution risk for untrusted input files
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append([x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                # numeric ids are stored as int, others as str
                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa
            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                # expand the upper-triangular input to a full symmetric matrix
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                if 'id' not in keys:
                    keys['id'] = 'basic'
                meta['scorer'][keys['id']] = scorer
            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            # plain data row
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:
        raise Exception(
            "Error processing line {0}:\n".format(j) +
            str(data[1:][j]) + '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    # merge collected meta-data into the result
    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        # promote the first tree (sorted by id) to the canonical one
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
def __init__(self, filename, conf=''):
    """
    Parse data regularly if the data has not been loaded from a pickled version.

    Parameters
    ----------
    filename : {dict, str, wordlist-like}
        Either a dictionary mapping integer row ids to rows (key 0 holds
        the header), another wordlist-like object exposing ``_data`` and
        ``_meta``, or the path of a qlc file.
    conf : str (default='')
        Path to a configuration file, forwarded to ``read_conf``.

    Raises
    ------
    ValueError
        If the header and the data rows disagree in length.
    IOError
        If ``filename`` is a string but no such file exists.
    TypeError
        If ``filename`` is of an unsupported type.
    """
    self.log = log.get_logger()

    # try to load the data
    internal_import = False

    # check whether it's a dictionary from which we load
    if isinstance(filename, dict):
        input_data = filename
        if 'filename' not in input_data:
            self.filename = rcParams['filename']
        internal_import = True

        # make check for correct input, there was a bug with a wrong
        # evaluation which is hopefully fixed by now
        tmp_keys = [k for k in input_data if isinstance(k, int)]
        if len(input_data[0]) != len(input_data[tmp_keys[0]]):
            log.warning(input_data[0], input_data[tmp_keys[0]])
            raise ValueError("[!] Wrong input format!")  # pragma: no cover
    # check whether it's another wordlist-object
    elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
        input_data = dict([(key, [v for v in value]) for key, value in \
            filename._data.items()])
        input_data.update(filename._meta.items())
        # reconstruct the header row from the other object's header mapping
        input_data[0] = [
            a for a, b in sorted(
                filename.header.items(),
                key=lambda x: x[1],
                reverse=False)
        ]
        internal_import = True
        self.filename = rcParams['filename']
    # or whether the data is an actual file
    elif isinstance(filename, string_types) and os.path.isfile(filename):
        input_data = read_qlc(filename)
        self.filename = filename
    # raise an error otherwise
    elif isinstance(filename, string_types):
        raise IOError("Input file '{0}' does not exist.".format(filename))
    else:
        raise TypeError(
            "Unrecognized type for 'filename' argument: {0}".format(
                type(filename).__name__))

    self._alias, self._class, self._class_string, self._alias2 = read_conf(conf)

    # register default aliases (lower and upper case) for header columns
    # that the configuration does not know about
    for name in input_data[0]:
        if name.lower() not in self._alias:
            self._alias[name.lower()] = name.lower()
            self._class[name.lower()] = str
        if name.upper() not in self._alias:
            self._alias[name.upper()] = name.lower()
            self._class[name.upper()] = str

    # add empty alias for empty strings XXX why was that? I can't remember
    # why this was important XXX
    self._alias[''] = ''

    # the header stores the indices of the data in the original data dictionary
    self.header = dict(
        zip([self._alias[x] for x in input_data[0]],
            range(len(input_data[0]))))

    # now create a specific header which has all aliases
    self._header = {k: v for k, v in self.header.items()}

    # add a sorted header for reference
    self.columns = sorted(self.header, key=lambda x: self.header[x])

    # assign all aliases to the header
    for alias in self._alias:
        try:
            self._header[alias] = self._header[self._alias[alias]]
        except:
            pass

    # assign the data as attribute to the word list class. Note that we
    # need to check for the type here, but since numpy also offers integer
    # types, we don't check for type(x) == int, but instead use the
    # str.numeric-function that returns numeric values only if it is an
    # integer
    self._data = {
        int(k): v for k, v in input_data.items()
        if k != 0 and str(k).isnumeric()
    }

    # check for same length of all columns
    check_errors = ''
    for k, v in self._data.items():
        if len(v) != len(self.header):
            check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                k, len(v), len(self.header))
    if check_errors:
        raise ValueError(check_errors + '\n' + ', '.join(sorted(self.header)))

    # iterate over self._data and change the values according to the
    # functions (only needed when reading from file)
    if not internal_import:
        heads = sorted(self._header.items(), key=lambda x: x[1])
        for key in self._data:
            check = []
            for head, i in heads:
                if i not in check:
                    logstring = 'Problem with row {0} in col {1}, expected' + \
                        ' «{4}» as datatype but received «{3}» ' + \
                        ' (ROW: {2}, entry {5}).'
                    try:
                        # coerce the cell into its configured datatype
                        self._data[key][i] = self._class[head](self._data[key][i])
                        check.append(i)
                    except KeyError:
                        log.warning(
                            logstring.format(
                                key, i,
                                '|'.join([str(x) for x in self._data[key]]),
                                self._data[key][i],
                                self._class[head],
                                head))
                    except ValueError:
                        log.warning(
                            logstring.format(
                                key, i,
                                '|'.join([str(x) for x in self._data[key]]),
                                self._data[key][i],
                                self._class[head],
                                head))

    # create entry attribute of the wordlist
    self.entries = sorted(
        set([b.lower() for a, b in self._alias.items() if b]))

    # assign meta-data (all non-integer keys of the input)
    self._meta = {}
    for key in [k for k in input_data if type(k) != int]:
        self._meta[key] = input_data[key]
def from_cldf(
        cls,
        path,
        columns=(
            'parameter_id',
            'concept_name',
            'language_id',
            'language_name',
            'value',
            'form',
            'segments',
            'language_glottocode',
            'concept_concepticon_id',
            'language_latitude',
            'language_longitude',
            'cognacy'
        ),
        namespace=(
            ('concept_name', 'concept'),
            ('language_id', 'doculect'),
            ('segments', 'tokens'),
            ('language_glottocode', 'glottolog'),
            ('concept_concepticon_id', 'concepticon'),
            ('language_latitude', 'latitude'),
            ('language_longitude', 'longitude'),
            ('cognacy', 'cognacy'),
            ('cogid_cognateset_id', 'cogid')
        ),
        filter=lambda row: row["form"],
        **kwargs):
    """Load a CLDF dataset.

    Open a CLDF Dataset – with metadata or metadata-free – (only Wordlist
    datasets are supported for now, because other modules don't seem to make
    sense for LingPy) and transform it into this Class. Columns from the
    FormTable are imported in lowercase, columns from LanguageTable,
    ParameterTable and CognateTable are prefixed with `langage_`, `concept_`
    and `cogid_`and converted to lowercase.

    Notes
    -----
    CLDFs default column names for wordlists are different from LingPy's,
    so you probably have to use::

    >>> lingpy.Wordlist.from_cldf(
        "Wordlist-metadata.json",
    )

    in order to avoid errors from LingPy not finding required columns.

    Parameters
    ----------
    columns: list or tuple
        The list of columns to import. (default: all columns)
    filter: function: rowdict → bool
        A condition function for importing only some rows. (default: lambda
        row: row["form"])
    All other parameters are passed on to the `cls`

    Returns
    -------
    A `cls` object representing the CLDF dataset
    """
    # fixed wordlist settings; these override whatever the caller passed
    kw = {
        'row': 'concept',
        'col': 'doculect',
        'conf': util.data_path('conf', 'wordlist.rc'),
    }
    kwargs.update(kw)

    if isinstance(namespace, tuple):
        namespace = dict(namespace)

    # get the datatypes from configuration as to namespace
    datatypes = read_conf(kwargs['conf'])[1]

    # Load the dataset.
    fname = Path(path)
    if not fname.exists():
        raise FileNotFoundError('{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        dataset = pycldf.dataset.Dataset.from_metadata(fname)
    else:
        # metadata-free dataset loaded directly from the data file
        dataset = pycldf.dataset.Dataset.from_data(fname)

    if dataset.module == "Wordlist":
        # First, make a list of cognate codes if they are in a separate table.
        cognateset_assignments = {}
        try:
            form_reference = dataset["CognateTable", "formReference"].name
            for row in dataset["CognateTable"].iterdicts():
                cognateset_assignments[row[form_reference]] = row
        except KeyError:
            # Either there are no cognate codes, or they are in the form
            # table. Both options are fine.
            pass

        f_id = dataset["FormTable", "id"].name

        # Access columns by type, not by name.
        language_column = dataset["FormTable", "languageReference"].name
        parameter_column = dataset["FormTable", "parameterReference"].name

        try:
            l_id = dataset["LanguageTable", "id"].name
            languages = {l[l_id]: l for l in dataset["LanguageTable"].iterdicts()}
        except KeyError:
            # no LanguageTable: fall back to a pass-through lookup
            l_id = "ID"
            languages = bounce_as_id
        try:
            c_id = dataset["ParameterTable", "id"].name
            concepts = {c[c_id]: c for c in dataset["ParameterTable"].iterdicts()}
        except KeyError:
            # no ParameterTable: fall back to a pass-through lookup
            c_id = "ID"
            concepts = bounce_as_id

        # create dictionary
        D = {0: columns}  # Reserve the header

        for row in dataset["FormTable"].iterdicts():
            # TODO: Improve prefixing behaviour
            # merge cognate, language and concept properties into one row
            # dict, each family of keys with its own lowercase prefix
            s = {"cogid_{:}".format(key).lower(): value
                 for key, value in cognateset_assignments.get(
                     row[f_id], {}).items()}
            s.update(
                {"language_{:}".format(key).lower(): value
                 for key, value in languages[row[language_column]].items()})
            s.update(
                {"concept_{:}".format(key).lower(): value
                 for key, value in concepts[row[parameter_column]].items()})
            s.update({k.lower(): v for k, v in row.items()})

            if not filter(s):
                continue

            # check for numeric ID
            try:
                idx = int(row[f_id])
            except ValueError:
                idx = len(D)
            # avoid clashes with already-assigned row ids
            while idx in D:
                idx += 1

            if not D[0]:
                # no explicit column selection: import every available key
                columns = list(s.keys())
                D[0] = [c.lower() for c in columns]

            D[idx] = [datatypes.get(
                namespace.get(column, ''),
                lambda x: x)(s.get(column, '')) for column in columns]

        # rename the header columns according to the namespace mapping
        D[0] = [namespace.get(c, c) for c in columns]
        if len(D[0]) != len(set(D[0])):
            log.warning('|'.join(columns))
            log.warning('|'.join(D[0]))
            raise ValueError('name space clashes, cannot parse data')

        # convert to wordlist and return
        return cls(D, **kwargs)
    else:
        # For most LingPy applications, it might be best to see whether we got
        # a Wordlist module.
        raise ValueError("LingPy has no procedures for CLDF {:} data.".format(
            dataset.module))
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will
        be ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            # "@key: value" style meta-data
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    # a repeated key collects its values in a list
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords; attributes follow the tag name
            # as key="value" pairs separated by spaces
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {
                    k: v[1:-1]
                    for k, v in [key.split('=') for key in tmp.split(' ')[1:]]
                }
            else:
                dtype = tmp.strip()
                keys = {}

            # collect everything up to the matching closing tag
            tmp = []
            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]
            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}
                if not keys:
                    keys["id"] = "1"
                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    # NOTE(review): eval on a file-supplied type name is a
                    # code-execution risk for untrusted input files
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                # numeric ids are stored as int, other ids as str
                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa
            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                # expand the upper-triangular input into a symmetric matrix
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                keys.setdefault('id', 'basic')
                meta['scorer'][keys['id']] = scorer
            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            # plain data row
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:  # pragma: no cover
        raise Exception(
            "Error processing line {0}:\n".format(j) +
            str(data[1:][j]) + '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    # merge the collected meta-data into the result
    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        # promote the first tree (sorted by id) to the canonical one
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
def pid(almA, almB, mode=2):
    """
    Calculate the Percentage Identity (PID) score for aligned sequence pairs.

    Parameters
    ----------
    almA, almB : string or list
        The aligned sequences which can be either a string or a list.
    mode : { 1, 2, 3, 4, 5 }
        Indicate which of the four possible PID scores described in
        :evobib:`Raghava2006` should be calculated, the fifth possibility is
        added for linguistic purposes:

        1. identical positions / (aligned positions + internal gap positions),
        2. identical positions / aligned positions,
        3. identical positions / shortest sequence, or
        4. identical positions / shortest sequence (including internal gap
           pos.)
        5. identical positions / (aligned positions + 2 * number of gaps)

    Returns
    -------
    score : float
        The PID score of the given alignment as a floating point number between
        0 and 1. ``None`` is returned for an unrecognized ``mode``, and 0 is
        returned (with a warning) when the denominator would be zero.

    Notes
    -----
    The PID score is a common measure for the diversity of a given alignment.
    The implementation employed by LingPy follows the description of
    :evobib:`Raghava2006` where four different variants of PID scores are
    distinguished. Essentially, the PID score is based on the comparison of
    identical residue pairs with the total number of residue pairs in a given
    alignment.

    Examples
    --------
    Load an alignment from the test suite.

    >>> from lingpy import *
    >>> pairs = PSA(get_file('test.psa'))

    Extract the alignments of the first aligned sequence pair.

    >>> almA,almB,score = pairs.alignments[0]

    Calculate the PID score of the alignment.

    >>> pid(almA,almB)
    0.44444444444444442

    See also
    --------
    lingpy.compare.Multiple.get_pid

    """
    # tally identical positions, aligned positions, and internal gaps
    idn_pos = 0
    int_gps = 0
    aln_pos = 0

    for charA, charB in zip(almA, almB):
        gaps = [charA, charB].count('-')
        if gaps == 1:
            int_gps += 1
        elif gaps == 0:
            aln_pos += 1
            if charA == charB:
                idn_pos += 1
        # gaps == 2 (gap aligned with gap) is ignored entirely

    # pick the denominator according to the requested mode
    if mode == 1:
        denominator = aln_pos
    elif mode == 2:
        denominator = aln_pos + int_gps
    elif mode == 3:
        # shortest sequence, gaps excluded
        denominator = min(
            len([i for i in almA if i != '-']),
            len([i for i in almB if i != '-']))
    elif mode == 4:
        # shortest sequence with internal gaps counted (leading/trailing
        # gaps stripped)
        denominator = min(
            len(''.join([i[0] for i in almA]).strip('-')),
            len(''.join([i[0] for i in almB]).strip('-')))
    elif mode == 5:
        denominator = len(almA)
    else:
        # unknown mode: preserve the original implicit-None behavior
        return None

    # single guarded division for every mode; previously mode 5 crashed
    # with ZeroDivisionError on empty input while modes 1-4 returned 0
    try:
        return idn_pos / denominator
    except ZeroDivisionError:
        log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
        return 0
def get_partial_scorer(self, **keywords): """ Create a scoring function based on sound correspondences. Parameters ---------- method : str (default='shuffle') Select between "markov", for automatically generated random strings, and "shuffle", for random strings taken directly from the data. ratio : tuple (default=3,2) Define the ratio between derived and original score for sound-matches. vscale : float (default=0.5) Define a scaling factor for vowels, in order to decrease their score in the calculations. runs : int (default=1000) Choose the number of random runs that shall be made in order to derive the random distribution. threshold : float (default=0.7) The threshold which used to select those words that are compared in order to derive the attested distribution. modes : list (default = [("global",-2,0.5),("local",-1,0.5)]) The modes which are used in order to derive the distributions from pairwise alignments. factor : float (default=0.3) The scaling factor for sound segments with identical prosodic environment. force : bool (default=False) Force recalculation of existing distribution. preprocessing: bool (default=False) Select whether SCA-analysis shall be used to derive a preliminary set of cognates from which the attested distribution shall be derived. rands : int (default=1000) If "method" is set to "markov", this parameter defines the number of strings to produce for the calculation of the random distribution. limit : int (default=10000) If "method" is set to "markov", this parameter defines the limit above which no more search for unique strings will be carried out. cluster_method : {"upgma" "single" "complete"} (default="upgma") Select the method to be used for the calculation of cognates in the preprocessing phase, if "preprocessing" is set to c{True}. gop : int (default=-2) If "preprocessing" is selected, define the gap opening penalty for the preprocessing calculation of cognates. 
unattested : {int, float} (default=-5) If a pair of sounds is not attested in the data, but expected by the alignment algorithm that computes the expected distribution, the score would be -infinity. Yet in order to allow to smooth this behaviour and to reduce the strictness, we set a default negative value which does not necessarily need to be too high, since it may well be that we miss a potentially good pairing in the first runs of alignment analyses. Use this keyword to adjust this parameter. unexpected : {int, float} (default=0.000001) If a pair is encountered in a given alignment but not expected according to the randomized alignments, the score would be not calculable, since we had to divide by zero. For this reason, we set a very small constant, by which the score is divided in this case. Not that this constant is only relevant in those cases where the shuffling procedure was not carried out long enough. """ kw = dict( method=rcParams['lexstat_scoring_method'], ratio=rcParams['lexstat_ratio'], vscale=rcParams['lexstat_vscale'], runs=rcParams['lexstat_runs'], threshold=rcParams['lexstat_scoring_threshold'], modes=rcParams['lexstat_modes'], factor=rcParams['align_factor'], restricted_chars=rcParams['restricted_chars'], force=False, preprocessing=False, rands=rcParams['lexstat_rands'], limit=rcParams['lexstat_limit'], cluster_method=rcParams['lexstat_cluster_method'], gop=rcParams['align_gop'], preprocessing_threshold=rcParams[ 'lexstat_preprocessing_threshold'], preprocessing_method=rcParams['lexstat_preprocessing_method'], subset=False, defaults=False, unattested=-5, unexpected=0.00001, smooth=1) kw.update(keywords) if kw['defaults']: return kw # get parameters and store them in string params = dict( ratio=kw['ratio'], vscale=kw['vscale'], runs=kw['runs'], scoring_threshold=kw['threshold'], preprocessing_threshold=kw['preprocessing_threshold'], modestring=':'.join('{0}-{1}-{2:.2f}'.format(a, abs(b), c) for a, b, c in kw['modes']), factor=kw['factor'], 
restricted_chars=kw['restricted_chars'], method=kw['method'], preprocessing='{0}:{1}:{2}'.format(kw['preprocessing'], kw['cluster_method'], kw['gop']), unattested=kw['unattested'], unexpected=kw['unexpected'], smooth=kw['smooth']) parstring = '_'.join([ '{ratio[0]}:{ratio[1]}', '{vscale:.2f}', '{runs}', '{scoring_threshold:.2f}', '{modestring}', '{factor:.2f}', '{restricted_chars}', '{method}', '{preprocessing}', '{preprocessing_threshold}', '{unexpected:.2f}', '{unattested:.2f}' ]).format(**params) # check for existing attributes if hasattr(self, 'cscorer') and not kw['force']: log.warning( "An identical scoring function has already been calculated, " "force recalculation by setting 'force' to 'True'.") return # check for attribute if hasattr(self, 'params') and not kw['force']: if 'cscorer' in self.params: if self.params['cscorer'] == params: log.warning( "An identical scoring function has already been " "calculated, force recalculation by setting 'force'" " to 'True'.") return else: log.warning( "A different scoring function has already been calculated, " "overwriting previous settings.") # store parameters self.params = {'cscorer': params} self._meta['params'] = self.params self._stamp += "# Parameters: " + parstring + '\n' # get the correspondence distribution self._corrdist = self._get_partial_corrdist(**kw) # get the random distribution self._randist = self._get_partial_randist(**kw) # get the average gop gop = sum([m[1] for m in kw['modes']]) / len(kw['modes']) # create the new scoring matrix matrix = [[c for c in line] for line in self.bscorer.matrix] char_dict = self.bscorer.chars2int for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)): for charA, charB in product( list(self.freqs[tA]) + [util.charstring(i + 1)], list(self.freqs[tB]) + [util.charstring(j + 1)]): exp = self._randist.get((tA, tB), {}).get((charA, charB), False) att = self._corrdist.get((tA, tB), {}).get((charA, charB), False) # in the following we follow the former lexstat 
protocol if att <= kw['smooth'] and i != j: att = False if att and exp: score = np.log2((att**2) / (exp**2)) elif att and not exp: score = np.log2((att**2) / kw['unexpected']) elif exp and not att: score = kw['unattested'] # XXX gop ??? else: # elif not exp and not att: score = -90 # ??? # combine the scores if rcParams['gap_symbol'] not in charA + charB: sim = self.bscorer[charA, charB] else: sim = gop # get the real score rscore = (kw['ratio'][0] * score + kw['ratio'][1] * sim) \ / sum(kw['ratio']) try: iA = char_dict[charA] iB = char_dict[charB] # use the vowel scale if charA[4] in self.vowels and charB[4] in self.vowels: matrix[iA][iB] = matrix[iB][iA] = kw['vscale'] * rscore else: matrix[iA][iB] = matrix[iB][iA] = rscore except: pass self.cscorer = misc.ScoreDict(self.chars, matrix) self._meta['scorer']['cscorer'] = self.cscorer
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.

    Parameters
    ----------
    infile : str
        Path to the input PSA file; the default output filename replaces
        the last four characters (the extension) with '.html'.
    template, css : str or False
        Paths to an HTML template / CSS file; the bundled defaults are used
        when False.
    comment : str (default='#')
        Lines starting with this string are skipped.
    compact : bool (default=True)
        Collapse all whitespace in the resulting HTML.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4] + '.html',
        compact=True)
    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    # read the raw file, dropping comment lines
    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []  # NOTE(review): collected but never used below — confirm
    taxa = []
    alignments = []

    # drop the header line of the PSA file
    del data[0]

    # PSA records are 4 lines each (id, seq A, seq B, separator); on a
    # malformed record we advance by 1 to resynchronize.
    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])
            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')
            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]
            taxa.append((taxonA, taxonB))
            pairs.append(
                ('.'.join([k for k in almA if k != '-']),
                 '.'.join([k for k in almB if k != '-'])))
            alignments.append(
                ([str(a) for a in almA], [str(b) for b in almB], 0))
            # alignments of a pair must have equal length
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        # Render one aligned sequence as a row of <div> residues, colored by
        # Dolgopolsky sound class.
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        # rounded percentage identity of the pair
        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'
    html = template.format(alignments=out, css=css)

    if kw['compact']:
        # squeeze whitespace out of the generated HTML
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
def tokens2morphemes(tokens, **keywords):
    """
    Split a string into morphemes if it contains separators.

    Notes
    -----
    Function splits a list of tokens into subsequent lists of morphemes if the
    list contains morpheme separators. If no separators are found, but
    tonemarkers, it will still split the string according to the tones. If you
    want to avoid this behavior, set the keyword **split_on_tones** to False.

    Parameters
    ----------
    sep : str (default="◦")
        Select your morpheme separator.
    word_sep: str (default="_")
        Select your word separator.

    Returns
    -------
    morphemes : list
        A nested list of the original segments split into morphemes.

    Raises
    ------
    ValueError
        If *tokens* is not a list or a tuple.
    """
    if not isinstance(tokens, (list, tuple)):
        raise ValueError("The sequence needs to be a list or a tuple.")
    kw = {
        "sep": rcParams['morpheme_separator'],
        "word_sep": rcParams['word_separator'],
        "word_seps": rcParams['word_separators'],
        "seps": rcParams['morpheme_separators'],
        "split_on_tones": True,
        "tone": "T",
        "cldf": False
    }
    kw.update(keywords)
    # disabling tone splitting is implemented by blanking the tone class
    if not kw['split_on_tones']:
        kw['tone'] = ''

    # check for other hints than the clean separators in the data: if no
    # explicit separator occurs but tone markers do, insert separators after
    # each non-final tone token
    new_tokens = list(tokens)
    if kw['sep'] not in tokens and kw['word_sep'] not in tokens:
        class_string = tokens2class(tokens, 'cv', cldf=kw['cldf'])
        if kw['tone'] in class_string \
                and '+' not in class_string and '_' not in class_string:
            new_tokens = []
            for i, token in enumerate(tokens):
                if class_string[i] == kw['tone'] and i != len(class_string) - 1:
                    new_tokens += [token, kw['sep']]
                else:
                    new_tokens += [token]

    # split at any separator character; the membership string is built once
    # instead of being re-concatenated on every iteration
    separators = kw['sep'] + kw['word_sep'] + kw['word_seps'] + kw['seps']
    out = [[]]
    for token in new_tokens:
        if token not in separators:
            out[-1].append(token)
        else:
            out.append([])

    # check for bad examples (adjacent/leading/trailing separators produce
    # empty morphemes, which are warned about and dropped)
    if any(not morpheme for morpheme in out):
        log.warning("[!] Your data contains empty morpheme segments.")
        out = [x for x in out if x]
    return out
def __init__(self, filename, conf=''):
    """
    Parse data regularly if the data has not been loaded from a pickled version.

    Parameters
    ----------
    filename : dict, wordlist-like object, or str
        Either a ready-made data dictionary (integer keys → rows, key 0 →
        header), another object exposing ``_data``/``_meta``, or a path to a
        QLC file on disk.
    conf : str (default='')
        Path to a configuration file; falls back to the bundled 'qlc.rc'.

    Raises
    ------
    ValueError
        If the input dictionary is malformed or rows have inconsistent length.
    IOError
        If *filename* is a string but no such file exists.
    TypeError
        If *filename* is of an unsupported type.
    """
    self.log = log.get_logger()

    # try to load the data
    internal_import = False

    # check whether it's a dictionary from which we load
    if isinstance(filename, dict):
        input_data = filename
        if 'filename' not in input_data:
            self.filename = rcParams['filename']
        internal_import = True

        # make check for correct input, there was a bug with a wrong
        # evaluation which is hopefully fixed by now
        tmp_keys = [k for k in input_data if isinstance(k, int)]
        if len(input_data[0]) != len(input_data[tmp_keys[0]]):
            print(input_data[0], input_data[tmp_keys[0]])
            raise ValueError("[!] Wrong input format!")  # pragma: no cover
    # check whether it's another wordlist-object
    elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
        input_data = dict([(key, [v for v in value]) for key, value in \
            filename._data.items()])
        input_data.update(filename._meta.items())
        # rebuild the header row (key 0) from the source object's header
        input_data[0] = [a for a, b in sorted(
            filename.header.items(), key=lambda x: x[1], reverse=False)]
        internal_import = True
        self.filename = rcParams['filename']
    # or whether the data is an actual file
    elif isinstance(filename, string_types) and os.path.isfile(filename):
        input_data = read_qlc(filename)
        self.filename = filename
    # raise an error otherwise
    elif isinstance(filename, string_types):
        raise IOError("Input file '{0}' does not exist.".format(filename))
    else:
        raise TypeError("Unrecognized type for 'filename' argument: {0}".format(
            type(filename).__name__))

    # load the configuration file
    if not conf:
        conf = util.data_path('conf', 'qlc.rc')

    # read the file defined by its path in conf
    tmp = [line.split('\t') for line in util.read_config_file(conf)]

    # define two attributes, _alias, and _class which store the aliases and
    # the datatypes (classes) of the given entries
    self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}

    for name, cls, alias in tmp:
        # make sure the name itself is there
        # NOTE(review): eval() on the config's class column — only safe
        # because the config ships with the package; never feed it
        # untrusted files.
        self._alias[name.lower()] = self._alias[name.upper()] = name
        self._class[name.lower()] = self._class[name.upper()] = eval(cls)
        self._class_string[name.lower()] = self._class_string[name.upper()] = cls

        # add the aliases
        for a in alias.split(','):
            self._alias[a.lower()] = self._alias[a.upper()] = name
            self._class[a.lower()] = self._class[a.upper()] = eval(cls)
            self._class_string[a.lower()] = self._class_string[a.upper()] = cls

        self._alias2[name] = sorted(set(alias.split(','))) + [name]

    # append the names in data[0] to self.conf to make sure that all data
    # is covered, even the types which are not specifically defined in the
    # conf file. the datatype defaults here to "str"
    for name in input_data[0]:
        if name.lower() not in self._alias:
            self._alias[name.lower()] = name.lower()
            self._class[name.lower()] = str
        if name.upper() not in self._alias:
            self._alias[name.upper()] = name.lower()
            self._class[name.upper()] = str

    # add empty alias for empty strings XXX why was that? I can't remember
    # why this was important XXX
    self._alias[''] = ''

    # the header stores the indices of the data in the original data dictionary
    self.header = dict(
        zip([self._alias[x] for x in input_data[0]], range(len(input_data[0]))))

    # now create a specific header which has all aliases
    self._header = {k: v for k, v in self.header.items()}

    # add a sorted header for reference
    self.columns = sorted(self.header, key=lambda x: self.header[x])

    # assign all aliases to the header
    # NOTE(review): the bare except swallows KeyError for aliases whose
    # canonical name is not in the header — presumably intentional, but an
    # explicit `except KeyError` would be clearer.
    for alias in self._alias:
        try:
            self._header[alias] = self._header[self._alias[alias]]
        except:
            pass

    # assign the data as attribute to the word list class. Note that we
    # need to check for the type here, but since numpy also offers integer
    # types, we don't check for type(x) == int, but instead use the
    # str.numeric-function that returns numeric values only if it is an
    # integer
    self._data = {
        int(k): v for k, v in input_data.items() if k != 0 and str(k).isnumeric()}

    # check for same length of all columns
    check_errors = ''
    for k, v in self._data.items():
        if len(v) != len(self.header):
            check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                k, len(v), len(self.header))
    if check_errors:
        raise ValueError(check_errors + '\n' + ', '.join(sorted(self.header)))

    # iterate over self._data and change the values according to the
    # functions (only needed when reading from file)
    if not internal_import:
        heads = sorted(self._header.items(), key=lambda x: x[1])
        for key in self._data:
            check = []
            for head, i in heads:
                if i not in check:
                    logstring = 'Problem with row {0} in col {1}, expected' + \
                        ' «{4}» as datatype but received «{3}» ' + \
                        ' (ROW: {2}, entry {5}).'
                    # coerce each cell to its configured datatype; failures
                    # are logged but the raw value is kept
                    try:
                        self._data[key][i] = self._class[head](self._data[key][i])
                        check.append(i)
                    except KeyError:
                        log.warning(
                            logstring.format(
                                key, i,
                                '|'.join([str(x) for x in self._data[key]]),
                                self._data[key][i],
                                self._class[head],
                                head))
                    except ValueError:
                        log.warning(
                            logstring.format(
                                key, i,
                                '|'.join([str(x) for x in self._data[key]]),
                                self._data[key][i],
                                self._class[head],
                                head))

    # create entry attribute of the wordlist
    self.entries = sorted(set([b.lower() for a, b in self._alias.items() if b]))

    # assign meta-data (all non-integer keys of the input)
    self._meta = {}
    for key in [k for k in input_data if type(k) != int]:
        self._meta[key] = input_data[key]
def plot_heatmap(wordlist, filename="heatmap", fileformat="pdf", ref='cogid',
                 normalized=False, refB='', **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid')
        The name of the column that contains the cognate identifiers.
    normalized : {bool str} (default=True)
        If set to c{False}, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=5)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=45)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,10))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.

    Notes
    -----
    This function plots shared cognate percentages.
    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False)
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix (integer counts when unnormalized, floats for
    # normalized scores)
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree; the dendrogram's leaf order determines the
    # taxon order of the heatmap
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes([
            keywords['left'],
            keywords['bottom'],
            0.25 * keywords['width'],
            keywords['height']
        ])
        # [0.01,0.1,0.2,0.7])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left',
        )
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        ax1.spines['bottom'].set_color('#ffffff')
        ax1.spines['top'].set_color('#ffffff')
        ax1.spines['left'].set_color('#ffffff')
        ax1.spines['right'].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']
    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates; upper triangle uses `ref`,
    # lower triangle mirrors it unless a second reference `refB` is given
    if keywords['matrix']:
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(taxa=taxonA, flat=True, entry=ref)
                        cogsB = wordlist.get_list(taxa=taxonB, flat=True, entry=ref)
                        cogsA, cogsB = set(cogsA), set(cogsB)
                        shared = len(cogsA.intersection(cogsB))
                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(taxa=taxonA, entry=ref)
                        cogsB = wordlist.get_dict(taxa=taxonB, entry=ref)
                        shared = 0
                        slots = 0
                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:
                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warning(
                                str([
                                    shared, slots,
                                    len(cogsA),
                                    len(cogsB), taxonA, taxonB
                                ]))
                            shared = 0.0
                    matrix[i][j] = shared
                    # if refB is also a possibiltiy
                    if not refB:
                        matrix[j][i] = shared
                elif i > j and refB:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(taxa=taxonA, flat=True, entry=refB)
                        cogsB = wordlist.get_list(taxa=taxonB, flat=True, entry=refB)
                        cogsA, cogsB = set(cogsA), set(cogsB)
                        shared = len(cogsA.intersection(cogsB))
                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(taxa=taxonA, entry=refB)
                        cogsB = wordlist.get_dict(taxa=taxonB, entry=refB)
                        shared = 0
                        slots = 0
                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:
                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warning(
                                str([
                                    shared, slots,
                                    len(cogsA),
                                    len(cogsB), taxonA, taxonB
                                ]))
                            shared = 0.0
                    matrix[i][j] = shared
                elif i == j:
                    cogs = wordlist.get_list(taxa=taxonA, flat=True, entry=ref)
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        matrix[i][j] = len(set(cogs))

    ax2 = fig.add_axes([
        left,  # keywords['left']+0.25 * keywords['width']+0.05,
        keywords['bottom'],
        keywords['width'],
        keywords['height']
    ])
    cmap = keywords['cmap']

    # [0.15,0.1,0.7,0.7])
    if 'distances' in keywords and keywords['distances']:
        # NOTE(review): the inner loop iterates enumerate(matrix) (rows), not
        # enumerate(line) (cells); this only covers all columns because the
        # matrix is square (width x width) — confirm before passing a
        # non-square 'matrix' keyword together with 'distances'.
        for i, line in enumerate(matrix):
            for j, cell in enumerate(matrix):
                matrix[i][j] = 1 - matrix[i][j]

    # a dummy 2x2 matrix spanning [vmin, vmax] so the colorbar gets the full
    # value range; the real data is drawn via plt.imshow below
    nmatrix = [[keywords['vmax'], keywords['vmin']],
               [keywords['vmin'], keywords['vmax']]]

    im = ax2.matshow(nmatrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin'])

    # set the xticks: label every `steps`-th taxon only
    steps = int(len(taxa) / keywords['steps'] + 0.5)
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(idxs, selected_taxa, size=keywords['textsize'],
               rotation=keywords['xrotation'], rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        # invisible imshow carries the actual data range for the colorbar
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(left=keywords['left'], right=keywords['right'],
                        top=keywords['top'], bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    # also dump the raw matrix to a plain-text sidecar file
    f = open(filename + '.matrix', 'w')
    for i, t in enumerate(taxa):
        f.write('{0:20}'.format(t))
        for j, c in enumerate(matrix[i]):
            if not normalized:
                f.write('\t{0:3}'.format(int(c)))
            else:
                f.write('\t{0:.2f}'.format(c))
        f.write('\n')
    f.close()
    log.file_written(filename + '.' + fileformat)
def context_profile(wordlist, ref='ipa', col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw', merge_vowels=False,
                    brackets=None, splitters='/,;~', merge_geminates=True,
                    clts=False, bad_word="<???>", bad_sound="<?>",
                    unknown_sound="!{0}", examples=2, max_entries=100,
                    normalization_form="NFC"):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=True)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    clts : dict (default=None)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="«???»")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="«?»")
        Indicate how sounds that could not be converted to a sound class
        be handled. Note that both "bad_word" and "bad_sound" are
        format-strings, so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If with_clts is set to True, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.
    examples : int(default=2)
        Indicate the number of examples that should be printed out.

    Returns
    -------
    profile : generator
        A generator of tuples (three items), indicating the segment,
        its frequency, the conversion to sound classes in the Dolgopolsky
        sound-class model, and the unicode-codepoints.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(
            wordlist.iter_rows(ref, col), desc='iter words',
            total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        word = unicodedata.normalize(normalization_form, word)
        if word.strip():
            try:
                # NOTE(review): clean_string is called with brackets=None,
                # i.e. the function's own `brackets` argument is only used
                # for the splitter check below — confirm this is intended.
                cleaned_string = clean_string(
                    word,
                    semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels,
                    brackets=None,
                    ignore_brackets=False,
                    normalization_form=normalization_form,
                    split_entries=False,
                    preparse=None,
                    rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain whole word if there are splitters in the word
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    # mark word-initial ('^') and word-final ('$') context on
                    # the first and last segment respectively
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(
                            context_pre, context_post, cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    # characters of the raw word that vanished during
                    # cleaning are recorded as NULL segments
                    for segment in [
                            x for x in word
                            if x not in ' '.join(cleaned_string)
                    ]:
                        if segment.strip():
                            profile[segment] += [(language, word)]
                            nulls.add(segment)
            # deliberate best-effort: any parsing failure is logged and the
            # row is recorded in `errors` instead of aborting the run
            except:
                errors.add(idx)
                log.warning('problem parsing {0}'.format(word))

    # context markers themselves map to NULL
    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    # yield profile rows ordered by frequency (most frequent first)
    for idx, (s, entries) in pb(
            enumerate(
                sorted(profile.items(), key=lambda x: len(x[1]),
                       reverse=True)),
            desc='yielding entries', total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words, langs = [l[1] for l in entries][:max_entries], \
            [l[0] for l in entries][:max_entries]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x),
                   reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                # NOTE(review): the `unknown_sound` format parameter is not
                # used here; the '!' prefix is hard-coded — confirm whether
                # this should read unknown_sound.format(s.strip('^$')).
                ipa = '!' + s.strip('^$')
            else:
                ipa = str(sound)
        else:
            ipa = s.strip('^$')
        yield s, ipa, examples_, languages, frequency, codepoints
def _shared_cognates(wordlist, taxonA, taxonB, entry, normalized):
    """
    Return the number (or proportion) of cognate sets shared by two taxa.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        The wordlist holding the cognate identifiers.
    taxonA, taxonB : str
        Names of the two taxa (doculects) to compare.
    entry : str
        Name of the column containing the cognate identifiers.
    normalized : {False, "jaccard", "swadesh"}
        If False, return the raw count of shared cognate IDs; "jaccard"
        divides that count by the size of the union of both cognate sets;
        "swadesh" returns the proportion of meaning slots attested in both
        taxa that contain at least one shared cognate ID.
    """
    if normalized in [False, "jaccard"]:
        cogsA = set(wordlist.get_list(taxa=taxonA, flat=True, entry=entry))
        cogsB = set(wordlist.get_list(taxa=taxonB, flat=True, entry=entry))
        shared = len(cogsA.intersection(cogsB))
        if normalized:
            shared = shared / len(cogsA.union(cogsB))
        return shared

    # "swadesh" normalization: compare cognate IDs per meaning slot
    cogsA = wordlist.get_dict(taxa=taxonA, entry=entry)
    cogsB = wordlist.get_dict(taxa=taxonB, entry=entry)
    shared, slots = 0, 0
    for key in cogsA:
        # we follow the STARLING procedure in ignoring missing data: only
        # meanings attested in both taxa count as comparison slots
        if key in cogsB:
            # check for shared items
            if [k for k in cogsA[key] if k in cogsB[key]]:
                shared += 1
            slots += 1
    try:
        return shared / slots
    except ZeroDivisionError:
        log.warning(str(
            [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
        return 0.0


def plot_heatmap(
        wordlist,
        filename="heatmap",
        fileformat="pdf",
        ref='cogid',
        normalized=False,
        refB='',
        **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to c{False}, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        If given, a second cognate column used to fill the lower triangle of
        the matrix, so that two cognate judgments can be compared in one plot.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        The number of steps in which names of taxa will be written to the
        axes.
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10, 5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.

    Notes
    -----
    This function plots shared cognate percentages. Besides the image file it
    also writes ``filename + '.matrix'``, a tab-separated text dump of the
    plotted matrix.
    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa
        left=0.01,  # rcParams['phybo_xlimr']
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml']
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt']
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False)
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist: it determines the order of
    # the taxa on both axes
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized and normalized not in ["jaccard", "swadesh"]:
        raise ValueError(
            "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix: raw counts are integers, normalized scores are
    # floats
    matrix = np.zeros(
        (wordlist.width, wordlist.width),
        dtype=float if normalized else int)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree as a dendrogram on the left-hand side
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes([
            keywords['left'],
            keywords['bottom'],
            0.25 * keywords['width'],
            keywords['height']])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left',
        )
        # the dendrogram reorders the taxa; adopt its leaf order (top-down)
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        # hide the frame around the dendrogram
        for spine in ['bottom', 'top', 'left', 'right']:
            ax1.spines[spine].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']
    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if keywords['matrix']:
        # a precomputed matrix was passed in; use it as-is (NOTE(review): its
        # dimensions are not validated against the taxa -- confirm callers)
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    shared = _shared_cognates(
                        wordlist, taxonA, taxonB, ref, normalized)
                    matrix[i][j] = shared
                    # mirror into the lower triangle unless refB fills it
                    if not refB:
                        matrix[j][i] = shared
                elif i > j and refB:
                    # lower triangle: second cognate judgment for comparison
                    matrix[i][j] = _shared_cognates(
                        wordlist, taxonA, taxonB, refB, normalized)
                elif i == j:
                    # diagonal: number of cognate sets per taxon, or 1.0 when
                    # the scores are normalized
                    cogs = wordlist.get_list(taxa=taxonA, flat=True, entry=ref)
                    matrix[i][j] = 1.0 if normalized else len(set(cogs))

    ax2 = fig.add_axes([
        left,
        keywords['bottom'],
        keywords['width'],
        keywords['height']])

    # optionally convert similarities to distances in place
    if keywords['distances']:
        for i, row in enumerate(matrix):
            # fixed: iterate the cells of the row, not the matrix rows (the
            # original only worked by accident because the matrix is square)
            for j, _cell in enumerate(row):
                matrix[i][j] = 1 - matrix[i][j]

    # NOTE(review): only this 2x2 matrix of the extreme values is drawn
    # visibly; the full matrix is passed to the invisible imshow below for
    # colorbar scaling -- confirm this rendering is intended
    nmatrix = [
        [keywords['vmax'], keywords['vmin']],
        [keywords['vmin'], keywords['vmax']]]
    im = ax2.matshow(
        nmatrix,
        aspect='auto',
        origin='lower',
        interpolation='nearest',
        cmap=keywords['cmap'],
        vmax=keywords['vmax'],
        vmin=keywords['vmin'])

    # write only every n-th taxon name to the axes; guard against a zero step
    # size for small taxa sets (would crash range() with step 0)
    steps = int(len(taxa) / keywords['steps'] + 0.5) or 1
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # replace taxon names by user-supplied labels where given
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])
    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        # invisible image of the full matrix, used to scale the colorbar
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False,
                   vmax=keywords['vmax'])
        cbar = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        cbar.set_label(keywords["colorbar_label"],
                       size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    # dump the plotted matrix as a tab-separated text file; "with" guarantees
    # the handle is closed even if a write fails
    with open(filename + '.matrix', 'w') as f:
        for i, t in enumerate(taxa):
            f.write('{0:20}'.format(t))
            for j, c in enumerate(matrix[i]):
                if not normalized:
                    f.write('\t{0:3}'.format(int(c)))
                else:
                    f.write('\t{0:.2f}'.format(c))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)