def renumber(wordlist, source, target='', override=False):
    """
    Create numerical identifiers from string identifiers.
    """
    # iterate over wordlist and get all source ids
    sources = sorted(set([str(wordlist[k, source]) for k in wordlist]))

    # convert to numbers
    targets = list(range(1, len(sources) + 1))

    # add to wordlist
    target = target or (source + 'id')

    # make converter
    converter = dict(zip(sources, targets))

    # check for zero ids; all keys are strings here, so the check must use
    # the string '0' (an integer key can never occur)
    if '0' in converter:
        converter['0'] = 0
    if '' in converter:
        converter[''] = 0

    wordlist.add_entries(target, source, lambda x: converter[str(x)], override=override)

    # add stuff to meta
    wordlist._meta[source + '2' + target] = converter

    log.info("Successfully renumbered {0}.".format(source))
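# Usage sketch (illustrative, not part of the module): the file name and the
# target column are assumptions; any wordlist with a string-valued COGID
# column would do.
from lingpy import Wordlist

wl = Wordlist('wordlist.tsv')
renumber(wl, 'cogid', target='cogidn')  # map string cogids to 1..n
print(wl._meta['cogid2cogidn'])         # the stored string-to-int mapping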
def _get_colexifications(wordlist, entry='ipa', concept='concept', family='family'):
    """
    Helper function computes colexifications for a given set of languages in a
    wordlist.
    """
    if family not in wordlist.header:
        family = 'doculect'

    taxa = wordlist.cols
    colexifications = []
    for taxon in taxa:
        log.info('Analyzing taxon {0}...'.format(taxon))

        tmp_idxs = wordlist.get_list(taxon=taxon, flat=True)
        tmp_family = wordlist[tmp_idxs[0], family]
        tmp_concepts = wordlist.get_list(taxon=taxon, flat=True, entry=concept)
        tmp_entries = wordlist.get_list(taxon=taxon, flat=True, entry=entry)

        # iterate over all concepts and add them to the graph
        for (i, c1), (j, c2) in combinations2(enumerate(tmp_concepts)):
            if tmp_entries[i] == tmp_entries[j] and c1 != c2:
                colexifications += [(c1, c2, taxon, tmp_family, tmp_entries[i])]

    return colexifications
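# Sketch of how the returned tuples could feed a colexification graph
# (illustrative only: networkx is an assumption here, not a dependency of
# this helper; wl is any loaded Wordlist with 'ipa' and 'concept' columns).
import networkx as nx

G = nx.Graph()
for c1, c2, taxon, family, form in _get_colexifications(wl):
    if G.has_edge(c1, c2):
        G[c1][c2]['weight'] += 1
    else:
        G.add_edge(c1, c2, weight=1)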
def test_convenience():
    # the convenience wrappers live in lingpy.log; note that older LingPy
    # versions export ``warn`` instead of ``warning``
    from lingpy.log import (
        info, warning, debug, error, deprecated, missing_module, file_written)
    info('m')
    warning('m')
    debug('m')
    error('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
def compile_dvt(path=''):
    """
    Function compiles diacritics, vowels, and tones.

    Notes
    -----
    Diacritics, vowels, and tones are defined in the :file:`data/models/dv/`
    directory of the LingPy package and automatically loaded when loading the
    LingPy library. The values are defined as the constants
    :py:obj:`rcParams['vowels']`, :py:obj:`rcParams['diacritics']`, and
    :py:obj:`rcParams['tones']`. Their core purpose is to guide the
    tokenization of IPA strings (cf.
    :py:func:`~lingpy.sequence.sound_classes.ipa2tokens`).

    In order to change the variables, one simply has to change the text files
    :file:`diacritics`, :file:`tones`, and :file:`vowels` in the
    :file:`data/models/dv` directory. The structure of these files is fairly
    simple: Each line contains a vowel or a diacritic character, whereas
    diacritics are preceded by a dash.

    See also
    --------
    lingpy.data.model.Model
    lingpy.data.derive.compile_model
    """
    log.info("Compiling diacritics and vowels...")

    # get the path to the models
    if not path:
        file_path = util.data_path('models', 'dvt')
    elif path in ['evolaemp', 'el']:
        file_path = util.data_path('models', 'dvt_el')
    else:
        file_path = path

    def _read_string(name):
        # normalize stuff
        # TODO: this is potentially dangerous and it is important to decide whether
        # TODO: switching to NFD might not be a better choice
        return util.read_text_file(
            os.path.join(file_path, name), normalize='NFC').replace('\n', '')

    diacritics = _read_string('diacritics').replace('-', '')
    vowels = ''.join([v for v in _read_string('vowels') if v not in diacritics])
    tones = _read_string('tones')

    dvt = (diacritics, vowels, tones)

    if path in ['evolaemp', 'el']:
        cache.dump(dvt, 'dvt_el')
    else:
        cache.dump(dvt, 'dvt')

    log.info("Diacritics and sound classes were successfully compiled.")
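# Usage sketch (illustrative, not part of the module): recompile the default
# diacritics/vowels/tones data and read it back; assumes lingpy.cache exposes
# load() as the counterpart of the dump() used above.
from lingpy import cache

compile_dvt()
diacritics, vowels, tones = cache.load('dvt')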
def songbook(self, *poems, filename='poems.tex'):
    """Export songs to latex songbook"""
    text = ""
    meta = self._meta.get('poems', {})
    if poems[0] == '*':
        poems = self.cols
    for poem in poems:
        log.info('analyzing poem ' + poem)
        idxs = sorted(
            self.get_list(col=poem, flat=True),
            key=lambda x: (self[x, 'stanza'], self[x, 'line_order']))
        text += r'\begin{{song}}{{{title}}}{{}}{{}}{{{author}}}{{}}{{}}'.format(
            title=meta.get(poem, {}).get('title'),
            author=meta.get(poem, {}).get('author')) + '\n'
        stanza, before = '', ''
        for idx in idxs:
            new_stanza, refrain = self[idx, 'stanza'], self[idx, 'refrain']
            if new_stanza != stanza:
                if before == 'verse':
                    text += r'\end{SBVerse}' + '\n'
                elif before == 'refrain':
                    text += r'\end{SBChorus}' + '\n'
                if refrain:
                    text += r'\begin{SBChorus}' + '\n'
                    before = 'refrain'
                else:
                    before = 'verse'
                    text += r'\begin{SBVerse}' + '\n'
                stanza = new_stanza
            line = []
            for word, chords in zip(self[idx, 'line'].n, self[idx, 'chords']):
                small_line = []
                for syl, chord in zip(word, chords.split()):
                    syl = syl.replace('_', ' ')
                    if chord.strip('_').strip():
                        small_line += [r'\Ch{' + chord + '}{' + syl + '}']
                    else:
                        small_line += [syl]
                line += ['-'.join(small_line)]
            text += ' '.join(line) + '\n\n'
        # close the environment that is still open for the last stanza
        if refrain:
            text += r'\end{SBChorus}' + '\n'
        else:
            text += r'\end{SBVerse}' + '\n'
        text += r'\end{song}' + '\n\n'
    pathlib.Path(filename).write_text(text, encoding='utf8')
    log.info('wrote file {0}'.format(filename))
def _import_sound_classes(filename):
    """
    Function imports individually defined sound classes from a text file and
    creates a replacement dictionary from these sound classes.
    """
    sc_repl_dict = {}
    errors = []
    for key, values in _read(filename, normalize='NFC').items():
        for value in values:
            log.info('%s' % ((value, key),))
            if value in sc_repl_dict and sc_repl_dict[value] != key:
                errors += [value]
            sc_repl_dict[value] = key
    if errors:
        raise ValueError("Values {0} in file {1} are multiply defined!".format(
            ' // '.join(sorted(set(errors))), filename))
    return sc_repl_dict
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------
    model : str
        A string indicating the name of the model which shall be created.
    path : str
        A string indicating the path where the model-folder is stored.

    Notes
    -----
    A model is defined by a folder placed in the :file:`data/models` directory
    of the LingPy package. The name of the folder reflects the name of the
    model. It contains three files: the file :file:`converter`, the file
    :file:`INFO`, and the optional file :file:`scorer`. The format
    requirements for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key
        is preceded by an ``@`` and followed by a colon and the value is
        written right next to the key in the same line, e.g.::

            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII-letter) representing
        the class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...

    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B   10.0   -10.0    5.0   ...
            E  -10.0     5.0  -10.0   ...
            F    5.0   -10.0   10.0   ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...

        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary. The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.

    See also
    --------
    lingpy.data.model.Model
    compile_dvt
    """
    log.info("Compiling model <" + model + ">...")

    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'), model, *cmps)
    # note: call the lambda, otherwise the function object itself gets logged
    log.debug("Model-Path: %s" % new_path())

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]
        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
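# Usage sketch (illustrative, not part of the module): the model name
# 'custom' and the ./models folder are assumptions; the folder must contain
# at least a converter file in the format described above.
from lingpy.data.model import Model

compile_model('custom', path='./models')
custom = Model('custom')  # the compiled model can now be loaded by name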
def star2qlc(filename, clean_taxnames=False, debug=False):
    """
    Converts a file directly output from starling to LingPy-QLC format.
    """
    cleant = clean_taxnames or identity
    data = csv2list(filename)

    # check for strange chars in data due to notepad errors
    data[0][0] = data[0][0].replace('\ufeff', '')

    # get the header
    header = data[0]

    # debugging
    if debug:
        error = False
        log.info("Header line has length {0}.".format(len(header)))
        for line in data[1:]:
            if len(line) != len(header):  # pragma: no cover
                log.error("Error for item {0} with length {1}, expected {2}.".format(
                    '/'.join(line[0:2]), len(line), len(header)))
                error = True
        if error:  # pragma: no cover
            log.error("Errors were found, aborting function call.")
            return
        else:
            log.info("Everything went fine, carrying on with function call.")

    # determine language names in header
    taxa = []
    for i in range(len(header) - 1):
        prev = header[i]
        post = header[i + 1]
        if prev in post and '#' in post:
            taxa += [prev]
            if len(taxa) == 1:
                lngIdx = i
        if prev == 'Number':
            numIdx = i
        if prev == 'Word':
            wrdIdx = i

    log.info('starling, indices (%s, %s, %s)' % (lngIdx, numIdx, wrdIdx))
    log.info('starling, taxa: %s' % taxa)

    # start filling in the dictionary
    D = {0: [
        'DOCULECT', 'CONCEPT', 'GLOSSID', 'WORDINSOURCE', 'ORTHOGRAPHY',
        'IPA', 'COGID']}

    idx = 1
    cognate_counter = 0
    current_concept = ''
    cognate_sets = []
    for line in data[2:]:
        gloss = line[wrdIdx]
        gnum = line[numIdx]

        # switch to next cognate set if there is a switch in concepts
        if current_concept != gloss and len(cognate_sets) != 0:
            max_cog = max(cognate_sets)
            cognate_counter = max_cog
            cognate_sets = []
            current_concept = gloss
        else:
            log.debug('starling, gloss/concept/counter (%s, %s, %s)' % (
                gloss, current_concept, cognate_counter))

        for i in range(lngIdx, len(header), 2):
            word = line[i]
            if '{' in word:
                ipa = word[:word.index('{')].strip()
                ortho = word[word.index('{') + 1:word.index('}')].strip()
            else:
                ipa = word
                ortho = word
            cogid = int(line[i + 1])
            if cogid != 0 and word:
                if cogid > 0:
                    cogid = cogid + cognate_counter
                # append cognate sets, essential for raising the counter
                cognate_sets += [int(cogid)]
                taxon = cleant(header[i])
                D[idx] = [taxon, gloss, gnum, word, ortho, ipa, cogid]
                idx += 1

    # re-iterate through data and reassign cognate sets with negative ids
    for k in D:
        if k:
            cogid = D[k][-1]
            if cogid < 0:
                cogid = -cognate_counter
                cognate_counter += 1
                D[k][-1] = cogid

    return D
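# Usage sketch (illustrative, not part of the module): the Starling export
# file name is an assumption.
from lingpy import Wordlist

D = star2qlc('starling_export.csv')
wl = Wordlist(D)  # the returned dictionary is valid Wordlist input
wl.output('tsv', filename='starling_converted')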
def tstats(wordlist, glm='', network=False, acs=False, tree=False,
           singletons=True, return_dists=False):
    """
    Calculate transmission statistics for a given MLN.
    """
    # check for attributes
    # return if no glm and no network is given
    if not glm and not network:
        raise ValueError(
            "You must specify at least one network or a gain-loss model.")

    # check for acs and network
    if glm:
        # network = wordlist.graph[glm]
        acs = wordlist.acs[glm]

    # check for tree
    if not tree:
        tree = wordlist.tree

    # add the distributions of the leaves to the acs
    for t in tree.taxa:
        paps = wordlist.get_list(taxa=t, entry='pap', flat=True)
        cons = wordlist.get_list(taxa=t, entry='concept', flat=True)
        acs[t] = [(p, c) for p, c in zip(paps, cons)]

    # now we apply a simple way to resolve directions by taking the first
    # occurrence of links in the tree to be the innovation, and all dependent
    # links to be the source of borrowings

    # create a queue
    queue = ['root']

    # make dictionary of innovated chars: these are currently all present in
    # the root, we order the list as [inheritance, innovation, transfer]
    tracer = dict([(c[0], [0, 1, 0]) for c in acs['root']])
    states = {}

    # start to iterate
    while queue:
        # get current node
        node = queue.pop(0)

        # get the children
        children = tree.getNodeMatchingName(node).Children

        # get the chars of the node
        node_chars = list(set([c[0] for c in acs[node]]))

        # if there are children
        for child in children:
            # get the node name
            name = child.Name

            # append name to the queue
            queue += [name]

            # get the chars of the child
            chars = list(set([c[0] for c in acs[name]]))
            inn = 0
            ret = 0
            bor = 0

            # iterate over chars and decide where they come from
            for char in chars:
                if char not in wordlist.singletons or not singletons:
                    # if char is inherited, increase the score
                    if char in node_chars:
                        tracer[char][0] += 1
                        ret += 1
                    # if it occurs for the first time, it is an innovation
                    elif char not in tracer:
                        tracer[char] = [0, 1, 0]
                        inn += 1
                    # if it is in the tracer
                    elif char not in node_chars and char in tracer:
                        tracer[char][2] += 1
                        bor += 1

            states[name] = [ret, inn, bor]

    # calculate the scores
    ret = sum([c[0] for c in tracer.values()])
    inn = sum([c[1] for c in tracer.values()])
    tra = sum([c[2] for c in tracer.values()])

    ipn = inn / len(acs)
    tpn = tra / len(acs)
    total2 = ipn + tpn

    log.info("Innovations: {0}, {1:.2f}, {2:.2f}".format(inn, ipn, ipn / total2))
    log.info("Transferred: {0}, {1:.2f}, {2:.2f}".format(tra, tpn, tpn / total2))

    if return_dists:
        leaves = []
        nodes = []
        for node in [n for n in tree.getNodeNames() if n != 'root']:
            innovations = states[node][1] + states[node][2]
            if node in tree.taxa:
                leaves += [innovations]
            else:
                nodes += [innovations]

        # compare the two distributions with a Kruskal-Wallis test
        p, r = sps.mstats.kruskalwallis(leaves, nodes)
        return p, r

    return inn, tra, tracer
def rc(rval=None, **keywords):
    """
    Function changes parameters globally set for LingPy sessions.

    Parameters
    ----------
    rval : string (default=None)
        Use this keyword to specify a return-value for the rc-function.
    schema : {"ipa", "asjp"}
        Change the basic schema for sequence comparison. When switching to
        "asjp", this means that sequences will be treated as sequences in ASJP
        code, otherwise, they will be treated as sequences written in basic
        IPA.
    verbose : bool (default=False)
        Use this keyword in order to switch to verbose output. This will be
        useful when using complex methods, in order to understand what the
        program is actually doing.
    debug : bool (default=False)
        Use this keyword to switch to debug-mode. It will give specific,
        internal output that is much more technical than the output resulting
        from "verbose".

    Notes
    -----
    This function is the standard way to communicate with the *rcParams*
    dictionary which is not imported as a default. If you want to see which
    parameters there are, you can load the rcParams dictionary directly::

        >>> from lingpy.settings import rcParams

    However, be careful when changing the values. They might produce some
    unexpected behavior.

    Examples
    --------
    Import LingPy:

    >>> from lingpy import *

    Switch from IPA transcriptions to ASJP transcriptions:

    >>> rc(schema="asjp")

    You can check which "basic orthography" is currently loaded:

    >>> rc('basic_orthography')
    'asjp'
    >>> rc(schema='ipa')
    >>> rc('basic_orthography')
    'fuzzy'

    Change basic values. Switch to verbose output, for example:

    >>> rc(verbose=True)
    Successfully changed parameters.

    """
    from lingpy import log

    if rval:
        return rcParams[rval]

    for key in keywords:
        if key == "schema":
            if keywords[key] in ["qlc", 'ipa']:
                diacritics, vowels, tones = load_dvt(path='')
                rcParams['asjp'] = Model('asjp')
                rcParams['sca'] = Model('sca')
                rcParams['dolgo'] = Model('dolgo')
                rcParams['art'] = Model('art')
                rcParams['diacritics'] = diacritics
                rcParams['vowels'] = vowels
                rcParams['tones'] = tones
                rcParams['_color'] = Model('color')
                rcParams['combiners'] = '\u0361\u035c'
                rcParams['breaks'] = '.-'
                rcParams['stress'] = "ˈˌ'"
                rcParams['merge_vowels'] = True
                rcParams['basic_orthography'] = 'fuzzy'
                # reset basic model to sca
                rcParams['model'] = rcParams['sca']
            elif keywords[key] in ['evolaemp', 'el', 'asjp']:
                diacritics, vowels, tones = load_dvt(path='el')
                rcParams['asjp'] = Model('asjp_el')
                rcParams['sca'] = Model('sca_el')
                rcParams['dolgo'] = Model('dolgo_el')
                rcParams['art'] = Model('art_el')
                rcParams['jaeger'] = Model('jaeger_el')
                rcParams['diacritics'] = diacritics
                rcParams['vowels'] = vowels
                rcParams['tones'] = tones
                rcParams['_color'] = Model('color_el')
                rcParams['combiners'] = '\u0361\u035c'
                rcParams['breaks'] = '.-'
                rcParams['stress'] = "ˈˌ'"
                rcParams['merge_vowels'] = False
                rcParams['basic_orthography'] = 'asjp'
                # reset the basic model to the asjp model
                rcParams['model'] = rcParams['asjp']
        if key in alias:
            rcParams[alias[key]] = keywords[key]
        else:
            rcParams[key] = keywords[key]
    log.info("Successfully changed parameters.")
def _get_partial_randist(self, **keywords):
    """
    Return the aligned results of randomly aligned sequences.
    """
    kw = dict(
        modes=rcParams['lexstat_modes'],
        factor=rcParams['align_factor'],
        restricted_chars=rcParams['restricted_chars'],
        runs=rcParams['lexstat_runs'],
        rands=rcParams['lexstat_rands'],
        limit=rcParams['lexstat_limit'],
        method=rcParams['lexstat_scoring_method'])
    kw.update(keywords)

    # determine the mode
    method = 'markov' if kw['method'] in ['markov', 'markov-chain', 'mc'] \
        else 'shuffle'

    corrdist = {}
    tasks = (self.width ** 2) / 2
    with util.pb(
            desc='RANDOM CORRESPONDENCE CALCULATION',
            total=tasks) as progress:
        for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)):
            progress.update(1)
            log.info(
                "Calculating random alignments for pair {0}/{1}.".format(tA, tB))
            corrdist[tA, tB] = defaultdict(float)

            # create morpheme-segmented pairs
            pairs = self.pairs[tA, tB]
            new_nums, new_weights, new_pros = [], [], []
            for idxA, idxB in pairs:
                for iA, iB in self._slices[idxA]:
                    for jA, jB in self._slices[idxB]:
                        new_nums += [(
                            self[idxA, self._numbers][iA:iB],
                            self[idxB, self._numbers][jA:jB])]
                        new_weights += [(
                            self[idxA, self._weights][iA:iB],
                            self[idxB, self._weights][jA:jB])]
                        new_pros += [(
                            self[idxA, self._prostrings][iA:iB],
                            self[idxB, self._prostrings][jA:jB])]

            # get the number pairs etc.
            sample = [(x, y) for x in range(len(new_nums))
                      for y in range(len(new_nums))]
            if len(sample) > kw['runs']:
                sample = random.sample(sample, kw['runs'])

            for mode, gop, scale in kw['modes']:
                corrs, included = calign.corrdist(
                    10.0,
                    [(new_nums[s[0]][0], new_nums[s[1]][1]) for s in sample],
                    [(new_weights[s[0]][0], new_weights[s[1]][1]) for s in sample],
                    [(new_pros[s[0]][0], new_pros[s[1]][1]) for s in sample],
                    gop, scale, kw['factor'], self.bscorer, mode,
                    kw['restricted_chars'])

                # change representation of gaps
                for a, b in list(corrs.keys()):
                    # get the correspondence count
                    d = corrs[a, b] * self._included[tA, tB] / included
                    # XXX check XXX* len(self.pairs[tA,tB]) / runs

                    # check for gaps
                    if a == '-':
                        a = util.charstring(i + 1)
                    elif b == '-':
                        b = util.charstring(j + 1)

                    corrdist[tA, tB][a, b] += d / len(kw['modes'])
    return corrdist
def context_profile(wordlist, ref='ipa', col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw', merge_vowels=False,
                    brackets=None, splitters='/,;~', merge_geminates=True,
                    clts=False, bad_word="<???>", bad_sound="<?>",
                    unknown_sound="!{0}", examples=2, max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect
    information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial orthography
        profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring
        brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    clts : dict (default=False)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a
        cross-linguistic transcription system as defined in the pyclts
        package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can
        add formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class
        should be handled. Note that both "bad_word" and "bad_sound" are
        format-strings, so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If clts is given, use this string to indicate that sounds are
        classified as "unknown sound" in the CLTS framework.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.

    Returns
    -------
    profile : generator
        A generator of tuples (six items), indicating the segment, its IPA
        interpretation (or a marker for unparsable words and sounds), example
        words, the languages in which the segment occurs, its frequency, and
        the unicode-codepoints.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(
            wordlist.iter_rows(ref, col), desc='iter words',
            total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        if word.strip():
            try:
                cleaned_string = clean_string(
                    word, semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels, brackets=None,
                    ignore_brackets=False, split_entries=False,
                    preparse=None, rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain whole word if there are splitters in the word
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(
                            context_pre, context_post, cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    for segment in [x for x in word
                                    if x not in ' '.join(cleaned_string)]:
                        profile[segment] += [(language, word)]
                        nulls.add(segment)
            except Exception:
                errors.add(idx)
                log.warn('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''
    for idx, (s, entries) in pb(
            enumerate(sorted(
                profile.items(), key=lambda x: len(x[1]), reverse=True)),
            desc='yielding entries', total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words = [l[1] for l in entries][:max_entries]
        langs = [l[0] for l in entries][:max_entries]
        languages = ', '.join(sorted(
            set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(sorted(
            set(words), key=lambda x: words.count(x), reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                ipa = '!' + s.strip('^$')
            else:
                ipa = str(sound)
        else:
            ipa = s.strip('^$')
        yield s, ipa, examples_, languages, frequency, codepoints
def _get_partial_corrdist(self, **keywords):
    """
    Use alignments to get correspondence statistics.
    """
    kw = dict(
        cluster_method='upgma',
        factor=rcParams['align_factor'],
        gop=rcParams['align_gop'],
        modes=rcParams['lexstat_modes'],
        preprocessing=False,
        preprocessing_method=rcParams['lexstat_preprocessing_method'],
        preprocessing_threshold=rcParams['lexstat_preprocessing_threshold'],
        split_on_tones=False,
        ref='scaid',
        restricted_chars=rcParams['restricted_chars'],
        threshold=rcParams['lexstat_scoring_threshold'],
        subset=False)
    kw.update(keywords)

    self._included = {}
    corrdist = {}

    if kw['preprocessing']:
        if kw['ref'] not in self.header:
            self.cluster(
                method=kw['preprocessing_method'],
                threshold=kw['preprocessing_threshold'],
                gop=kw['gop'],
                cluster_method=kw['cluster_method'],
                ref=kw['ref'])

    with util.pb(
            desc='CORRESPONDENCE CALCULATION',
            total=self.width ** 2 / 2) as pb:
        for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)):
            pb.update(1)
            log.info("Calculating alignments for pair {0} / {1}.".format(tA, tB))

            corrdist[tA, tB] = defaultdict(float)
            for mode, gop, scale in kw['modes']:
                pairs = self.pairs[tA, tB]
                if kw['subset']:
                    pairs = [pair for pair in pairs
                             if pair in self.subsets[tA, tB]]

                # threshold and preprocessing, make sure threshold is
                # different from pre-processing threshold when
                # preprocessing is set to false
                if kw['preprocessing']:
                    pairs = [pair for pair in pairs
                             if self[pair, kw['ref']][0] == self[pair, kw['ref']][1]]
                    threshold = 10.0
                else:
                    threshold = kw['threshold']

                # create morpheme-segmented pairs
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in pairs:
                    for iA, iB in self._slices[idxA]:
                        for jA, jB in self._slices[idxB]:
                            new_nums += [(
                                self[idxA, self._numbers][iA:iB],
                                self[idxB, self._numbers][jA:jB])]
                            new_weights += [(
                                self[idxA, self._weights][iA:iB],
                                self[idxB, self._weights][jA:jB])]
                            new_pros += [(
                                self[idxA, self._prostrings][iA:iB],
                                self[idxB, self._prostrings][jA:jB])]

                corrs, self._included[tA, tB] = calign.corrdist(
                    threshold, new_nums, new_weights, new_pros, gop, scale,
                    kw['factor'], self.bscorer, mode, kw['restricted_chars'])

                # change representation of gaps
                for (a, b), d in corrs.items():  # XXX check for bias XXX
                    if a == '-':
                        a = util.charstring(i + 1)
                    elif b == '-':
                        b = util.charstring(j + 1)
                    corrdist[tA, tB][a, b] += d / float(len(kw['modes']))

    return corrdist
def find_threshold(matrix, thresholds=[i * 0.05 for i in range(1, 19)][::-1], logs=True):
    """
    Use a variant of the method by :evobib:`Apeltsin2011` in order to find an
    optimal threshold.

    Parameters
    ----------
    matrix : list
        The distance matrix for which the threshold shall be determined.
    thresholds : list (default=[i * 0.05 for i in range(1, 19)][::-1])
        The range of thresholds that shall be tested.
    logs : {bool, builtins.function} (default=True)
        If set to **True**, the logarithm of the score beyond the threshold
        will be assigned as weight to the graph. If set to **False** all
        weights will be set to 1. Use a custom function to define individual
        ways to calculate the weights.

    Returns
    -------
    threshold : {float, None}
        If a float is returned, this is the threshold identified by the
        method. If **None** is returned, no threshold could be identified.

    Notes
    -----
    This is a very simple method that may not work well depending on the
    dataset. So we recommend using it with great care.
    """
    # get the old degree of the matrix
    odeg = _get_wad(matrix, 1)

    # store the plateaus (where nothing changes in the network)
    plato = {0: [1.0]}

    # this is the current index of the last plateau
    ci = 0
    minc = 0
    alls = []

    # start iterating and calculating
    for i, t in enumerate(thresholds[1:], 1):
        # get the new degree of the matrix under threshold t
        ndeg = _get_wad(matrix, t, logs)

        # if there is a new degree
        if ndeg:
            # get the change in comparison with the old degree
            cdeg = ndeg - odeg
            if cdeg < minc:
                minc = cdeg

            # swap old degree to new degree
            odeg = ndeg

            # if there's a plateau, the changed degree should be equal or
            # greater zero
            if cdeg >= 0:
                plato[ci] += [t]
            else:
                plato[i] = [t]
                ci = i
        alls += [(t, ndeg)]

    # try to find the plateau of maximal length
    sorted_plato = sorted(plato, key=lambda x: len(plato[x]), reverse=True)
    log.info('Found {0} thresholds.'.format(
        len([p for p in plato if len(plato[p]) > 1])))
    log.info('... %s' % (sorted([len(plato[p]) for p in plato], reverse=True),))

    # return the mean of the longest plateau, or None if no plateau of
    # length > 1 exists
    try:
        return [sum(plato[t]) / len(plato[t])
                for t in sorted_plato if len(plato[t]) > 1][0]
    except IndexError:
        return
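# Usage sketch (illustrative, not part of the module): run the plateau search
# on a small, made-up distance matrix.
matrix = [
    [0.0, 0.1, 0.8, 0.9],
    [0.1, 0.0, 0.7, 0.8],
    [0.8, 0.7, 0.0, 0.2],
    [0.9, 0.8, 0.2, 0.0],
]
best_t = find_threshold(matrix)
print(best_t)  # a float, or None if no plateau of length > 1 was found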
def calculate_data(wordlist, data, taxa='taxa', concepts='concepts',
                   ref='cogid', **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods
    """
    logger = log.get_logger()
    util.setdefaults(
        keywords,
        distances=False,
        tree_calc="upgma",
        cluster="upgma",
        force=False,
        threshold=0.5,
        cluster_method='upgma')

    # get taxa for current calculation (getattr replaces the eval() call that
    # was used here before)
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(
            wordlist, taxa, concepts, ref, **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning(
                "Reference tree has already been calculated, "
                "force overwrite by setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])
    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning(
                "Groups have already been calculated, "
                "force overwrite by setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
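# Usage sketch (illustrative, not part of the module): the file name is an
# assumption; any wordlist with a COGID column works.
from lingpy import Wordlist

wl = Wordlist('wordlist.tsv')
calculate_data(wl, 'dst')   # distance matrix from shared cognates
calculate_data(wl, 'tree')  # reference tree, reusing the stored distances
print(wl._meta['tree'])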