Example No. 1
def renumber(wordlist, source, target='', override=False):
    """
    Create numerical identifiers from string identifiers.
    """
    # iterate over wordlist and get all source ids
    sources = sorted(set([
        text_type(wordlist[k, source]) for k in wordlist]))

    # convert to numbers
    targets = list(range(1, len(sources) + 1))

    # add to wordlist
    target = target or (source + 'id')

    # make converter
    converter = dict(zip(sources, targets))

    # check for zero ids
    if 0 in converter:
        converter[0] = 0
    if '' in converter:
        converter[''] = 0

    wordlist.add_entries(
        target, source, lambda x: converter[text_type(x)], override=override)

    # add stuff to meta
    wordlist._meta[source + '2' + target] = converter
    log.info("Successfully renumbered {0}.".format(source))
Example No. 2
def renumber(wordlist, source, target='', override=False):
    """
    Create numerical identifiers from string identifiers.
    """
    # iterate over wordlist and get all source ids
    sources = sorted(set([str(wordlist[k, source]) for k in wordlist]))

    # convert to numbers
    targets = list(range(1, len(sources) + 1))

    # add to wordlist
    target = target or (source + 'id')

    # make converter
    converter = dict(zip(sources, targets))

    # check for zero ids
    if 0 in converter:
        converter[0] = 0
    if '' in converter:
        converter[''] = 0

    wordlist.add_entries(target,
                         source,
                         lambda x: converter[str(x)],
                         override=override)

    # add stuff to meta
    wordlist._meta[source + '2' + target] = converter
    log.info("Successfully renumbered {0}.".format(source))
Example No. 3
def _get_colexifications(wordlist,
                         entry='ipa',
                         concept='concept',
                         family='family'):
    """
    Helper function computes colexifications for a given set of languages in a
    wordlist.
    """
    if family not in wordlist.header:
        family = 'doculect'

    taxa = wordlist.cols
    colexifications = []
    for taxon in taxa:
        log.info('Analyzing taxon {0}...'.format(taxon))

        tmp_idxs = wordlist.get_list(taxon=taxon, flat=True)
        tmp_family = wordlist[tmp_idxs[0], family]
        tmp_concepts = wordlist.get_list(taxon=taxon, flat=True, entry=concept)
        tmp_entries = wordlist.get_list(taxon=taxon, flat=True, entry=entry)

        # iterate over all pairs of concepts and collect colexifications
        for (i, c1), (j, c2) in combinations2(enumerate(tmp_concepts)):
            if tmp_entries[i] == tmp_entries[j] and c1 != c2:
                colexifications += [(c1, c2, taxon, tmp_family, tmp_entries[i])]

    return colexifications
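
A sketch of how this private helper might be called; the input file is hypothetical, and the result is a flat list of colexification tuples::

    from lingpy import Wordlist

    wl = Wordlist('polysemies.tsv')  # hypothetical file with "ipa" and "concept"
    colexs = _get_colexifications(wl, entry='ipa', concept='concept')
    for c1, c2, taxon, family, form in colexs[:5]:
        print('{0} ~ {1} in {2} ({3}): {4}'.format(c1, c2, taxon, family, form))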
Example No. 4
def test_convenience():
    info('m')
    warning('m')
    debug('m')
    error('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
Example No. 5
    def test_convenience(self):
        from lingpy.log import info, warn, debug, error, deprecated, missing_module, file_written

        info('m')
        warn('m')
        debug('m')
        error('m')
        deprecated('o', 'n')
        missing_module('m')
        file_written('f')
Example No. 6
def compile_dvt(path=''):
    """
    Function compiles diacritics, vowels, and tones.

    Notes
    -----
    Diacritics, vowels, and tones are defined in the :file:`data/models/dv/` directory
    of the LingPy package and automatically loaded when loading the LingPy
    library. The values are defined as the constants
    :py:obj:`rcParams['vowels']`, :py:obj:`rcParams['diacritics']`, and
    :py:obj:`rcParams['tones']`. Their core purpose is to guide the
    tokenization of IPA strings (cf.
    :py:func:`~lingpy.sequence.sound_classes.ipa2tokens`). In order to change the
    variables, one simply has to change the text files :file:`diacritics`,
    :file:`tones`, and
    :file:`vowels` in the :file:`data/models/dv` directory. The structure of
    these files is fairly simple: each line contains a vowel or a diacritic
    character, with diacritics preceded by a dash.
    
    See also
    --------
    lingpy.data.model.Model
    lingpy.data.derive.compile_model
    """
    log.info("Compiling diacritics and vowels...")

    # get the path to the models
    if not path:
        file_path = util.data_path('models', 'dvt')
    elif path in ['evolaemp', 'el']:
        file_path = util.data_path('models', 'dvt_el')
    else:
        file_path = path

    def _read_string(name):
        # normalize stuff
        # TODO: this is potentially dangerous and it is important to decide whether
        # TODO: switching to NFD might not be a better choice
        return util.read_text_file(os.path.join(file_path, name),
                                   normalize='NFC').replace('\n', '')

    diacritics = _read_string('diacritics').replace('-', '')
    vowels = ''.join(
        [v for v in _read_string('vowels') if v not in diacritics])
    tones = _read_string('tones')

    dvt = (diacritics, vowels, tones)

    if path in ['evolaemp', 'el']:
        cache.dump(dvt, 'dvt_el')
    else:
        cache.dump(dvt, 'dvt')

    log.info("Diacritics and sound classes were successfully compiled.")
Example No. 7
def compile_dvt(path=''):
    """
    Function compiles diacritics, vowels, and tones.

    Notes
    -----
    Diacritics, vowels, and tones are defined in the :file:`data/models/dv/` directory
    of the LingPy package and automatically loaded when loading the LingPy
    library. The values are defined as the constants
    :py:obj:`rcParams['vowels']`, :py:obj:`rcParams['diacritics']`, and
    :py:obj:`rcParams['tones']`. Their core purpose is to guide the
    tokenization of IPA strings (cf.
    :py:func:`~lingpy.sequence.sound_classes.ipa2tokens`). In order to change the
    variables, one simply has to change the text files :file:`diacritics`,
    :file:`tones`, and
    :file:`vowels` in the :file:`data/models/dv` directory. The structure of
    these files is fairly simple: each line contains a vowel or a diacritic
    character, with diacritics preceded by a dash.
    
    See also
    --------
    lingpy.data.model.Model
    lingpy.data.derive.compile_model
    """
    log.info("Compiling diacritics and vowels...")

    # get the path to the models
    if not path:
        file_path = util.data_path('models', 'dvt')
    elif path in ['evolaemp', 'el']:
        file_path = util.data_path('models', 'dvt_el')
    else:
        file_path = path

    def _read_string(name):
        # normalize stuff
        # TODO: this is potentially dangerous and it is important to decide whether
        # TODO: switching to NFD might not be a better choice
        return util.read_text_file(
            os.path.join(file_path, name), normalize='NFC').replace('\n', '')

    diacritics = _read_string('diacritics').replace('-', '')
    vowels = ''.join([v for v in _read_string('vowels') if v not in diacritics])
    tones = _read_string('tones')

    dvt = (diacritics, vowels, tones)

    if path in ['evolaemp', 'el']:
        cache.dump(dvt, 'dvt_el')
    else:
        cache.dump(dvt, 'dvt')

    log.info("Diacritics and sound classes were successfully compiled.")
Example No. 8
    def songbook(self, *poems, filename='poems.tex'):
        """Export songs to latex songbook"""

        text = ""
        meta = self._meta.get('poems', {})
        if poems[0] == '*':
            poems = self.cols
        for poem in poems:
            log.info('analyzing poem ' + poem)
            idxs = sorted(self.get_list(col=poem, flat=True),
                          key=lambda x:
                          (self[x, 'stanza'], self[x, 'line_order']))
            text += r'\begin{{song}}{{{title}}}{{}}{{}}{{{author}}}{{}}{{}}'.format(
                title=meta.get(poem, {}).get('title'),
                author=meta.get(poem, {}).get('author')) + '\n'
            stanza, before = '', ''
            for idx in idxs:
                new_stanza, refrain = self[idx, 'stanza'], self[idx, 'refrain']
                if new_stanza != stanza:

                    if before == 'verse':
                        text += r'\end{SBVerse}' + '\n'
                    elif before == 'refrain':
                        text += r'\end{SBChorus}' + '\n'
                    if refrain:
                        text += r'\begin{SBChorus}' + '\n'
                        before = 'refrain'
                    else:
                        before = 'verse'
                        text += r'\begin{SBVerse}' + '\n'
                    stanza = new_stanza
                line = []
                for word, chords in zip(
                        self[idx, 'line'].n, self[idx, 'chords']):
                    small_line = []
                    for syl, chord in zip(word, chords.split()):
                        syl = syl.replace('_', ' ')
                        if chord.strip('_').strip():
                            small_line += [r'\Ch{' + chord + '}{' + syl + '}']
                        else:
                            small_line += [syl]
                    line += ['-'.join(small_line)]
                text += ' '.join(line) + '\n\n'
            if refrain:
                text += r'\end{SBChorus}' + '\n'
            else:
                text += r'\end{SBVerse}' + '\n'
            text += r'\end{song}' + '\n\n'
        pathlib.Path(filename).write_text(text, encoding='utf8')
        log.info('wrote file {0}'.format(filename))
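
A hedged call sketch: `songbook` is a method, so it needs an instance of the wordlist-like class holding the poems; both the class name and the file names below are hypothetical::

    poems = Poems('poems.tsv')                    # hypothetical container class
    poems.songbook('*', filename='songbook.tex')  # '*' exports every poem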
Example No. 9
def _import_sound_classes(filename):
    """
    Function imports individually defined sound classes from a text file and
    creates a replacement dictionary from these sound classes.
    """
    sc_repl_dict = {}
    errors = []
    for key, values in _read(filename, normalize='NFC').items():
        for value in values:
            log.info('%s' % ((value, key),))
            if value in sc_repl_dict and sc_repl_dict[value] != key:
                errors += [value]
            sc_repl_dict[value] = key
    if errors:
        raise ValueError("Values {0} in file {1} are multiply defined!".format(
            ' // '.join(sorted(set(errors))), filename))
    return sc_repl_dict
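
A toy round trip, assuming `_read` parses ``KEY : value, value`` lines as in the ``converter`` format documented under `compile_model` below (file name hypothetical)::

    from pathlib import Path

    Path('converter').write_text('B : f, p\nE : e, a\n', encoding='utf8')
    print(_import_sound_classes('converter'))
    # expected: {'f': 'B', 'p': 'B', 'e': 'E', 'a': 'E'}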
Example No. 10
def _get_colexifications(wordlist, entry='ipa', concept='concept', family='family'):
    """
    Helper function computes colexifications for a given set of languages in a
    wordlist.
    """
    if family not in wordlist.header:
        family = 'doculect'

    taxa = wordlist.cols
    colexifications = []
    for taxon in taxa:
        log.info('Analyzing taxon {0}...'.format(taxon))

        tmp_idxs = wordlist.get_list(taxon=taxon, flat=True)
        tmp_family = wordlist[tmp_idxs[0], family]
        tmp_concepts = wordlist.get_list(taxon=taxon, flat=True, entry=concept)
        tmp_entries = wordlist.get_list(taxon=taxon, flat=True, entry=entry)

        # iterate over all pairs of concepts and collect colexifications
        for (i, c1), (j, c2) in combinations2(enumerate(tmp_concepts)):
            if tmp_entries[i] == tmp_entries[j] and c1 != c2:
                colexifications += [(c1, c2, taxon, tmp_family, tmp_entries[i])]

    return colexifications
Example No. 11
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model folder is stored.

    Notes
    -----
    A model is defined by a folder placed in :file:`data/models` directory of
    the LingPy package. The name of the folder reflects the name of the model.
    It contains three files: the file :file:`converter`, the file :file:`INFO`,
    and the optional file :file:`scorer`. The format requirements for these
    files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::
            
            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII letter) representing the
        class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...
    
    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...
        
        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.
    
    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")
    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'),
                                          model, *cmps)

    log.debug("Model-Path: %s" % new_path)

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(
                enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Example No. 12
def star2qlc(filename, clean_taxnames=False, debug=False):
    """
    Converts a file exported directly from Starling to LingPy's QLC format.
    """
    cleant = clean_taxnames or identity
    data = csv2list(filename)

    # remove the BOM that Notepad-style editors may prepend
    data[0][0] = data[0][0].replace('\ufeff', '')

    # get the header
    header = data[0]

    # debugging
    if debug:
        error = False
        log.info("Header line has length {0}.".format(len(header)))
        for line in data[1:]:
            if len(line) != len(header):  # pragma: no cover
                log.error(
                    "Error for item {0} with length {1}, expected {2}.".format(
                        '/'.join(line[0:2]), len(line), len(header)))
                error = True
        if error:  # pragma: no cover
            log.error("Errors were found, aborting function call.")
            return
        else:
            log.info("Everything went fine, carrying on with function call.")

    # determine language names in header
    taxa = []
    for i in range(len(header) - 1):
        prev = header[i]
        post = header[i + 1]

        if prev in post and '#' in post:
            taxa += [prev]

            if len(taxa) == 1:
                lngIdx = i

        if prev == 'Number':
            numIdx = i

        if prev == 'Word':
            wrdIdx = i

    log.info('starling, indices (%s, %s, %s)' % (lngIdx, numIdx, wrdIdx))
    log.info('starling, taxa: %s' % taxa)

    # start filling in the dictionary
    D = {
        0: [
            'DOCULECT', 'CONCEPT', 'GLOSSID', 'WORDINSOURCE', 'ORTHOGRAPHY',
            'IPA', 'COGID'
        ]
    }

    idx = 1
    cognate_counter = 0
    current_concept = ''
    cognate_sets = []
    for line in data[2:]:
        gloss = line[wrdIdx]
        gnum = line[numIdx]

        # switch to next cognate set if there is a switch in concepts
        if current_concept != gloss and len(cognate_sets) != 0:
            max_cog = max(cognate_sets)
            cognate_counter = max_cog
            cognate_sets = []
            current_concept = gloss
        else:
            log.debug('starling, indices (%s, %s, %s)' %
                      (gloss, current_concept, cognate_counter))

        for i in range(lngIdx, len(header), 2):
            word = line[i]

            if '{' in word:
                ipa = word[:word.index('{')].strip()
                ortho = word[word.index('{') + 1:word.index('}')].strip()
            else:
                ipa = word
                ortho = word

            cogid = int(line[i + 1])

            if cogid != 0 and word:
                if cogid > 0:
                    cogid = cogid + cognate_counter

                # append cognate sets, essential for raising the counter
                cognate_sets += [int(cogid)]

                taxon = cleant(header[i])

                D[idx] = [taxon, gloss, gnum, word, ortho, ipa, cogid]
                idx += 1

    # re-iterate through data and reassign cognate sets with negative ids
    for k in D:
        if k:
            cogid = D[k][-1]
            if cogid < 0:
                cogid = -cognate_counter
                cognate_counter += 1
                D[k][-1] = cogid

    return D
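
A usage sketch (hypothetical file name); since the returned dictionary stores the header row under key 0, it can be passed straight to a Wordlist::

    from lingpy import Wordlist

    D = star2qlc('starling-export.csv')      # hypothetical Starling CSV export
    wl = Wordlist(D)
    wl.output('tsv', filename='converted')   # write the QLC/TSV version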
Example No. 13
def rc(rval=None, **keywords):
    """
    Function changes parameters globally set for LingPy sessions.

    Parameters
    ----------
    rval : string (default=None)
        Use this keyword to specify a return-value for the rc-function.
    schema : {"ipa", "asjp"}
        Change the basic schema for sequence comparison. When switching to
        "asjp", this means that sequences will be treated as sequences in ASJP
        code, otherwise, they will be treated as sequences written in basic
        IPA.

    Notes
    -----
    This function is the standard way to communicate with the *rcParams*
    dictionary which is not imported as a default. If you want to see which
    parameters there are, you can load the rcParams dictionary directly::

    >>> from lingpy.settings import rcParams

    However, be careful when changing the values. They might produce some
    unexpected behavior.

    Examples
    --------
    Import LingPy:

    >>> from lingpy import *

    Switch from IPA transcriptions to ASJP transcriptions:

    >>> rc(schema="asjp")

    You can check which "basic orthography" is currently loaded:

    >>> rc('basic_orthography')
    'asjp'
    >>> rc(schema='ipa')
    >>> rc('basic_orthography')
    'fuzzy'

    """
    from lingpy import log

    if rval:
        return rcParams[rval]

    for key in keywords:
        if key == "schema":
            if keywords[key] in ["qlc", 'ipa']:
                diacritics, vowels, tones = load_dvt(path='')
                rcParams['asjp'] = Model('asjp')
                rcParams['sca'] = Model('sca')
                rcParams['dolgo'] = Model('dolgo')
                rcParams['art'] = Model('art')
                rcParams['diacritics'] = diacritics
                rcParams['vowels'] = vowels
                rcParams['tones'] = tones
                rcParams['_color'] = Model('color')
                rcParams['combiners'] = '\u0361\u035c'
                rcParams['breaks'] = '.-'
                rcParams['stress'] = "ˈˌ'"
                rcParams['merge_vowels'] = True
                rcParams['basic_orthography'] = 'fuzzy'

                # reset basic model to sca
                rcParams['model'] = rcParams['sca']

            elif keywords[key] in ['evolaemp', 'el', 'asjp']:
                diacritics, vowels, tones = load_dvt(path='el')
                rcParams['asjp'] = Model('asjp_el')
                rcParams['sca'] = Model('sca_el')
                rcParams['dolgo'] = Model('dolgo_el')
                rcParams['art'] = Model('art_el')
                rcParams['jaeger'] = Model('jaeger_el')
                rcParams['diacritics'] = diacritics
                rcParams['vowels'] = vowels
                rcParams['tones'] = tones
                rcParams['_color'] = Model('color_el')
                rcParams['combiners'] = '\u0361\u035c'
                rcParams['breaks'] = '.-'
                rcParams['stress'] = "ˈˌ'"
                rcParams['merge_vowels'] = False
                rcParams['basic_orthography'] = 'asjp'

                # reset the basic model to the asjp model
                rcParams['model'] = rcParams['asjp']

        if key in alias:
            rcParams[alias[key]] = keywords[key]
        else:
            rcParams[key] = keywords[key]
    log.info("Successfully changed parameters.")
Example No. 14
def tstats(wordlist,
           glm='',
           network=False,
           acs=False,
           tree=False,
           singletons=True,
           return_dists=False):
    """
    Calculate transmission statistics for a given MLN.
    """

    # check for attributes
    # return if no glm and no network is given
    if not glm and not network:
        raise ValueError(
            "You must specify at least one network or a gain-loss model.")

    # check for acs and network
    if glm:
        # network = wordlist.graph[glm]
        acs = wordlist.acs[glm]

    # check for tree
    if not tree:
        tree = wordlist.tree

    # add the distributions of the leaves to the acs
    for t in tree.taxa:
        paps = wordlist.get_list(taxa=t, entry='pap', flat=True)
        cons = wordlist.get_list(taxa=t, entry='concept', flat=True)

        acs[t] = [(p, c) for p, c in zip(paps, cons)]

    # now we apply a simple way to resolve directions by taking the first
    # occurrence of links in the tree to be the innovation, and all dependent
    # links to be the source of borrowings

    # create a queue
    queue = ['root']

    # make dictionary of innovated chars: these are currently all present in
    # the root; the list is ordered as [inheritance, innovation, transfer]
    tracer = dict([(c[0], [0, 1, 0]) for c in acs['root']])
    states = {}

    # start to iterate
    while queue:

        # get current node
        node = queue.pop(0)

        # get the children
        children = tree.getNodeMatchingName(node).Children

        # get the chars of the node
        node_chars = list(set([c[0] for c in acs[node]]))

        # if there are children
        for child in children:

            # get the node name
            name = child.Name

            # append name to the queue
            queue += [name]

            # get the chars of the child
            chars = list(set([c[0] for c in acs[name]]))

            inn = 0
            ret = 0
            bor = 0
            # iterate over chars and decide where they come from
            for char in chars:

                if char not in wordlist.singletons or not singletons:
                    # if char is inherited, increase the score
                    if char in node_chars:
                        tracer[char][0] += 1
                        ret += 1

                    # if it occurs for the first time, it is an innovation
                    elif char not in tracer:
                        tracer[char] = [0, 1, 0]
                        inn += 1

                    # if it is in the tracer
                    elif char not in node_chars and char in tracer:
                        tracer[char][2] += 1
                        bor += 1

            states[name] = [ret, inn, bor]

    # calculate the scores
    ret = sum([c[0] for c in tracer.values()])
    inn = sum([c[1] for c in tracer.values()])
    tra = sum([c[2] for c in tracer.values()])

    ipn = inn / len(acs)
    tpn = tra / len(acs)

    total2 = ipn + tpn

    log.info("Innovations: {0}, {1:.2f}, {2:.2f}".format(
        inn, ipn, ipn / total2))
    log.info("Transferred: {0}, {1:.2f}, {2:.2f}".format(
        tra, tpn, tpn / total2))

    if return_dists:
        leaves = []
        nodes = []
        for node in [n for n in tree.getNodeNames() if n != 'root']:
            innovations = states[node][1] + states[node][2]
            if node in tree.taxa:
                leaves += [innovations]
            else:
                nodes += [innovations]

        # evaluate the difference between leaves and inner nodes (Kruskal-Wallis)
        p, r = sps.mstats.kruskalwallis(leaves, nodes)

        return p, r

    return inn, tra, tracer
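
A hedged call sketch: the function presupposes a wordlist that has been analyzed with LingPy's phylogeny (MLN) workflow, so that `wordlist.acs`, `wordlist.tree`, and `wordlist.singletons` are populated; the gain-loss model name below is hypothetical::

    inn, tra, tracer = tstats(wordlist, glm='w-1-1', singletons=False)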
Example No. 15
def rc(rval=None, **keywords):
    """
    Function changes parameters globally set for LingPy sessions.

    Parameters
    ----------
    rval : string (default=None)
        Use this keyword to specify a return-value for the rc-function.
    schema : {"ipa", "asjp"}
        Change the basic schema for sequence comparison. When switching to
        "asjp", this means that sequences will be treated as sequences in ASJP
        code, otherwise, they will be treated as sequences written in basic
        IPA.
    verbose : bool (default=False)
        Use this keyword in order to switch to verbose output. This will be
        useful when using complex methods, in order to understand what the
        program is actually doing.
    debug : bool (default=False)
        Use this keyword to switch to debug-mode. It will give specific,
        internal output that is much more technical than the output resulting
        from "verbose".

    Notes
    -----
    This function is the standard way to communicate with the *rcParams*
    dictionary which is not imported as a default. If you want to see which
    parameters there are, you can load the rcParams dictionary directly::

    >>> from lingpy.settings import rcParams

    However, be careful when changing the values. They might produce some
    unexpected behavior.

    Examples
    --------
    Import LingPy:

    >>> from lingpy import *

    Change basic values. Switch to verbose output, for example:

    >>> rc(verbose=True)
    Successfully changed parameters.
    """
    from lingpy import log

    if rval:
        return rcParams[rval]

    for key in keywords:
        if key == "schema":
            if keywords[key] in ["qlc", 'ipa']:
                diacritics, vowels, tones = load_dvt(path='')
                rcParams['asjp'] = Model('asjp')
                rcParams['sca'] = Model('sca')
                rcParams['dolgo'] = Model('dolgo')
                rcParams['art'] = Model('art')
                rcParams['diacritics'] = diacritics
                rcParams['vowels'] = vowels
                rcParams['tones'] = tones
                rcParams['_color'] = Model('color')
                rcParams['combiners'] = '\u0361\u035c'
                rcParams['breaks'] = '.-'
                rcParams['stress'] = "ˈˌ'"
                rcParams['merge_vowels'] = True
                rcParams['basic_orthography'] = 'fuzzy'

                # reset basic model to sca
                rcParams['model'] = rcParams['sca']

            elif keywords[key] in ['evolaemp', 'el', 'asjp']:
                diacritics, vowels, tones = load_dvt(path='el')
                rcParams['asjp'] = Model('asjp_el')
                rcParams['sca'] = Model('sca_el')
                rcParams['dolgo'] = Model('dolgo_el')
                rcParams['art'] = Model('art_el')
                rcParams['jaeger'] = Model('jaeger_el')
                rcParams['diacritics'] = diacritics
                rcParams['vowels'] = vowels
                rcParams['tones'] = tones
                rcParams['_color'] = Model('color_el')
                rcParams['combiners'] = '\u0361\u035c'
                rcParams['breaks'] = '.-'
                rcParams['stress'] = "ˈˌ'"
                rcParams['merge_vowels'] = False
                rcParams['basic_orthography'] = 'asjp'

                # reset the basic model to the asjp model
                rcParams['model'] = rcParams['asjp']

        if key in alias:
            rcParams[alias[key]] = keywords[key]
        else:
            rcParams[key] = keywords[key]
    log.info("Successfully changed parameters.")
Example No. 16
    def _get_partial_randist(self, **keywords):
        """
        Return the aligned results of randomly aligned sequences.
        """
        kw = dict(modes=rcParams['lexstat_modes'],
                  factor=rcParams['align_factor'],
                  restricted_chars=rcParams['restricted_chars'],
                  runs=rcParams['lexstat_runs'],
                  rands=rcParams['lexstat_rands'],
                  limit=rcParams['lexstat_limit'],
                  method=rcParams['lexstat_scoring_method'])
        kw.update(keywords)

        # determine the mode
        method = 'markov' if kw['method'] in ['markov', 'markov-chain', 'mc'] \
            else 'shuffle'

        corrdist = {}
        tasks = (self.width**2) / 2
        with util.pb(desc='RANDOM CORRESPONDENCE CALCULATION',
                     total=tasks) as progress:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                progress.update(1)
                log.info("Calculating random alignments"
                         "for pair {0}/{1}.".format(tA, tB))
                corrdist[tA, tB] = defaultdict(float)

                # create morpheme-segmented pairs
                pairs = self.pairs[tA, tB]
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in pairs:
                    for iA, iB in self._slices[idxA]:
                        for jA, jB in self._slices[idxB]:
                            new_nums += [(self[idxA, self._numbers][iA:iB],
                                          self[idxB, self._numbers][jA:jB])]
                            new_weights += [(self[idxA, self._weights][iA:iB],
                                             self[idxB, self._weights][jA:jB])]
                            new_pros += [(self[idxA, self._prostrings][iA:iB],
                                          self[idxB, self._prostrings][jA:jB])]
                # get the number pairs etc.
                sample = [(x, y) for x in range(len(new_nums))
                          for y in range(len(new_nums))]
                if len(sample) > kw['runs']:
                    sample = random.sample(sample, kw['runs'])

                for mode, gop, scale in kw['modes']:
                    corrs, included = calign.corrdist(
                        10.0, [(new_nums[s[0]][0], new_nums[s[1]][1])
                               for s in sample],
                        [(new_weights[s[0]][0], new_weights[s[1]][1])
                         for s in sample],
                        [(new_pros[s[0]][0], new_pros[s[1]][1])
                         for s in sample], gop, scale, kw['factor'],
                        self.bscorer, mode, kw['restricted_chars'])

                    # change representation of gaps
                    for a, b in list(corrs.keys()):
                        # get the correspondence count
                        d = corrs[a, b] * self._included[tA, tB] / included
                        # XXX check XXX* len(self.pairs[tA,tB]) / runs

                        # check for gaps
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)

                        corrdist[tA, tB][a, b] += d / len(kw['modes'])
        return corrdist
Example No. 17
def context_profile(wordlist,
                    ref='ipa',
                    col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                    merge_vowels=False,
                    brackets=None,
                    splitters='/,;~',
                    merge_geminates=True,
                    clts=False,
                    bad_word="<???>",
                    bad_sound="<?>",
                    unknown_sound="!{0}",
                    examples=2,
                    max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    clts : dict (default=False)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="«???»")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="«?»")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If clts is given, use this string to indicate sounds which are
        classified as "unknown sound" in the CLTS framework.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.
    max_entries : int (default=100)
        Maximal number of wordlist entries considered per segment when
        collecting example words and languages.

    Returns
    -------
    profile : generator
        A generator of six-item tuples, indicating the segment, its suggested
        IPA value, example words, the languages in which it occurs, its
        frequency, and its unicode code points.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(wordlist.iter_rows(ref, col),
                                  desc='iter words',
                                  total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        if word.strip():
            try:
                cleaned_string = clean_string(
                    word,
                    semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels,
                    brackets=None,
                    ignore_brackets=False,
                    split_entries=False,
                    preparse=None,
                    rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain whole word if there are splitters in the word
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(context_pre, context_post,
                                                   cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    for segment in [
                            x for x in word
                            if x not in ' '.join(cleaned_string)
                    ]:
                        profile[segment] += [(language, word)]
                        nulls.add(segment)
            except Exception:
                errors.add(idx)
                log.warn('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    sorted_profile = sorted(profile.items(), key=lambda x: len(x[1]),
                            reverse=True)
    for idx, (s, entries) in pb(enumerate(sorted_profile),
                                desc='yielding entries', total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words = [l[1] for l in entries][:max_entries]
        langs = [l[0] for l in entries][:max_entries]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x),
                   reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                ipa = '!' + s.strip('^$')
            else:
                ipa = text_type(sound)
        else:
            ipa = s.strip('^$')

        yield s, ipa, examples_, languages, frequency, codepoints
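
A sketch that materializes the generator into an orthography-profile TSV (file names hypothetical)::

    from lingpy import Wordlist

    wl = Wordlist('data.tsv')  # hypothetical wordlist with an "ipa" column
    rows = ['Grapheme\tIPA\tExamples\tLanguages\tFrequency\tCodepoints']
    for grapheme, ipa, examples, languages, frequency, codepoints in \
            context_profile(wl):
        rows.append('\t'.join([grapheme, ipa, examples, languages,
                               frequency, codepoints]))
    with open('orthography-profile.tsv', 'w', encoding='utf8') as f:
        f.write('\n'.join(rows))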
Example No. 18
    def _get_partial_corrdist(self, **keywords):
        """
        Use alignments to compute correspondence statistics.
        """
        kw = dict(
            cluster_method='upgma',
            factor=rcParams['align_factor'],
            gop=rcParams['align_gop'],
            modes=rcParams['lexstat_modes'],
            preprocessing=False,
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            split_on_tones=False,
            ref='scaid',
            restricted_chars=rcParams['restricted_chars'],
            threshold=rcParams['lexstat_scoring_threshold'],
            subset=False)
        kw.update(keywords)

        self._included = {}
        corrdist = {}

        if kw['preprocessing']:
            if kw['ref'] not in self.header:
                self.cluster(method=kw['preprocessing_method'],
                             threshold=kw['preprocessing_threshold'],
                             gop=kw['gop'],
                             cluster_method=kw['cluster_method'],
                             ref=kw['ref'])

        with util.pb(desc='CORRESPONDENCE CALCULATION',
                     total=self.width**2 / 2) as pb:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                pb.update(1)
                log.info("Calculating alignments for pair {0} / {1}.".format(
                    tA, tB))

                corrdist[tA, tB] = defaultdict(float)
                for mode, gop, scale in kw['modes']:
                    pairs = self.pairs[tA, tB]
                    if kw['subset']:
                        pairs = [
                            pair for pair in pairs
                            if pair in self.subsets[tA, tB]
                        ]

                    # threshold and preprocessing, make sure threshold is
                    # different from pre-processing threshold when
                    # preprocessing is set to false
                    if kw['preprocessing']:
                        pairs = [
                            pair for pair in pairs
                            if self[pair, kw['ref']][0] == self[pair, kw['ref']][1]]
                        threshold = 10.0
                    else:
                        threshold = kw['threshold']

                    # create morpheme-segmented pairs
                    new_nums, new_weights, new_pros = [], [], []
                    for idxA, idxB in pairs:
                        for iA, iB in self._slices[idxA]:
                            for jA, jB in self._slices[idxB]:
                                new_nums += [(self[idxA, self._numbers][iA:iB],
                                              self[idxB, self._numbers][jA:jB])]
                                new_weights += [(self[idxA, self._weights][iA:iB],
                                                 self[idxB, self._weights][jA:jB])]
                                new_pros += [(self[idxA, self._prostrings][iA:iB],
                                              self[idxB, self._prostrings][jA:jB])]

                    corrs, self._included[tA, tB] = calign.corrdist(
                        threshold, new_nums, new_weights, new_pros, gop, scale,
                        kw['factor'], self.bscorer, mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), d in corrs.items():
                        # XXX check for bias XXX
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)
                        corrdist[tA, tB][a, b] += d / float(len(kw['modes']))

        return corrdist
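
Both `_get_partial_corrdist` above and `_get_partial_randist` (Example No. 16) are internal steps of scorer creation for partial cognate detection; a hedged sketch of the public entry point, with method names assumed from LingPy's `compare.partial` module and a hypothetical input file::

    from lingpy.compare.partial import Partial

    part = Partial('partial-wordlist.tsv')  # morpheme-segmented wordlist
    part.get_partial_scorer(runs=1000)      # drives the two helpers above
    part.partial_cluster(method='lexstat', threshold=0.55, ref='cogids')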
Example No. 19
def find_threshold(matrix,
                   thresholds=[i * 0.05 for i in range(1, 19)][::-1],
                   logs=True):
    """
    Use a variant of the method by :evobib:`Apeltsin2011` in order to find an optimal
    threshold.

    Parameters
    ----------
    matrix : list
        The distance matrix for which the threshold shall be determined.
    thresholds : list (default=[i * 0.05 for i in range(1, 19)][::-1])
        The range of thresholds that shall be tested.
    logs : {bool, function} (default=True)
        If set to **True**, the logarithm of the score beyond the threshold
        will be assigned as weight to the graph. If set to **False**, all
        weights will be set to 1. Use a custom function to define individual
        ways to calculate the weights.

    Returns
    -------
    threshold : {float,None}
        If a float is returned, this is the threshold identified by the method.
        If **None** is returned, no threshold could be identified.

    Notes
    -----
    This is a very simple method that may not work well depending on the
    dataset, so we recommend using it with great care.
    """

    # get the old degree of the matrix
    odeg = _get_wad(matrix, 1)

    # store the plateaus (where nothing changes in the network)
    plato = {0: [1.0]}

    # this is the current index of the last plateau
    ci = 0
    minc = 0
    alls = []

    # start iterating and calculating
    for i, t in enumerate(thresholds[1:], 1):
        # get the new degree of the matrix under threshold t
        ndeg = _get_wad(matrix, t, logs)

        # if there is a new degree
        if ndeg:
            # get the change in comparison with the old degree
            cdeg = ndeg - odeg

            if cdeg < minc:
                minc = cdeg

            # swap old degree to new degree
            odeg = ndeg

            # on a plateau, the change in degree should be equal to or
            # greater than zero
            if cdeg >= 0:
                plato[ci] += [t]
            else:
                plato[i] = [t]
                ci = i

            alls += [(t, ndeg)]

    # try to find the plateau of maximal length
    sorted_plato = sorted(plato, key=lambda x: len(plato[x]), reverse=True)
    log.info('Found {0} thresholds.'.format(
        len([p for p in plato if len(plato[p]) > 1])))
    log.info('... %s' % (sorted([len(plato[p])
                                 for p in plato], reverse=True), ))
    # return the mean threshold of the first plateau containing more than one
    # threshold; if no such plateau exists, return None
    try:
        return [
            sum(plato[t]) / len(plato[t]) for t in sorted_plato
            if len(plato[t]) > 1
        ][0]
    except IndexError:
        return
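
A toy call on a small symmetric distance matrix; with this little data the method may well return None, as the docstring warns::

    matrix = [
        [0.0, 0.2, 0.8],
        [0.2, 0.0, 0.9],
        [0.8, 0.9, 0.0]]
    threshold = find_threshold(matrix)
    print(threshold)  # a float, or None if no usable plateau was found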
Example No. 20
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model folder is stored.

    Notes
    -----
    A model is defined by a folder placed in :file:`data/models` directory of
    the LingPy package. The name of the folder reflects the name of the model.
    It contains three files: the file :file:`converter`, the file :file:`INFO`,
    and the optional file :file:`scorer`. The format requirements for these
    files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::
            
            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII letter) representing the
        class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...
    
    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...
        
        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.
    
    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")
    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'), model, *cmps)

    log.debug("Model-Path: %s" % new_path)

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in
                  range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Example No. 21
def star2qlc(filename, clean_taxnames=False, debug=False):
    """
    Converts a file exported directly from Starling to LingPy's QLC format.
    """
    cleant = clean_taxnames or identity
    data = csv2list(filename)

    # remove the BOM that Notepad-style editors may prepend
    data[0][0] = data[0][0].replace('\ufeff', '')

    # get the header
    header = data[0]

    # debugging
    if debug:
        error = False
        log.info("Header line has length {0}.".format(len(header)))
        for line in data[1:]:
            if len(line) != len(header):  # pragma: no cover
                log.error("Error for item {0} with length {1}, expected {2}.".format(
                    '/'.join(line[0:2]), len(line), len(header)))
                error = True
        if error:  # pragma: no cover
            log.error("Errors were found, aborting function call.")
            return
        else:
            log.info("Everything went fine, carrying on with function call.")

    # determine language names in header   
    taxa = []
    for i in range(len(header) - 1):
        prev = header[i]
        post = header[i + 1]
        
        if prev in post and '#' in post:
            taxa += [prev]

            if len(taxa) == 1:
                lngIdx = i

        if prev == 'Number':
            numIdx = i

        if prev == 'Word':
            wrdIdx = i
    
    log.info('starling, indices (%s, %s, %s)' % (lngIdx, numIdx, wrdIdx))
    log.info('starling, taxa: %s' % taxa)

    # start filling in the dictionary
    D = {0: [
        'DOCULECT', 'CONCEPT', 'GLOSSID', 'WORDINSOURCE', 'ORTHOGRAPHY', 'IPA', 'COGID']}

    idx = 1
    cognate_counter = 0
    current_concept = ''
    cognate_sets = []
    for line in data[2:]:
        gloss = line[wrdIdx]
        gnum = line[numIdx]

        # switch to next cognate set if there is a switch in concepts
        if current_concept != gloss and len(cognate_sets) != 0:
            max_cog = max(cognate_sets)
            cognate_counter = max_cog 
            cognate_sets = []
            current_concept = gloss
        else:
            if debug:
                print(gloss, current_concept, cognate_counter)

        for i in range(lngIdx, len(header), 2):
            word = line[i]
            
            if '{' in word:
                ipa = word[:word.index('{')].strip()
                ortho = word[word.index('{') + 1:word.index('}')].strip()
            else:
                ipa = word
                ortho = word
            
            cogid = int(line[i + 1])

            if cogid != 0 and word:
                
                if cogid > 0:
                    cogid = cogid + cognate_counter

                # append cognate sets, essential for raising the counter
                cognate_sets += [int(cogid)]
                
                taxon = cleant(header[i])

                D[idx] = [taxon, gloss, gnum, word, ortho, ipa, cogid]
                idx += 1

    # second pass: give each entry with a negative cognate id a unique
    # (negative) id of its own; key 0 holds the header and is skipped
    for k in D:
        if k:
            cogid = D[k][-1]
            if cogid < 0:
                cogid = -cognate_counter
                cognate_counter += 1
                D[k][-1] = cogid

    return D
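
# Usage sketch (hedged): convert a Starling export and wrap the result in a
# LingPy wordlist; the file name 'starling_export.csv' is hypothetical.
from lingpy import Wordlist

D = star2qlc('starling_export.csv', debug=True)
if D:  # star2qlc returns None if the debug check finds malformed lines
    wl = Wordlist(D)
    wl.output('qlc', filename='starling_converted')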
Example No. 25
0
def rc(rval=None, **keywords):
    """
    Function changes parameters globally set for LingPy sessions.

    Parameters
    ----------
    rval : string (default=None)
        Use this keyword to specify a return-value for the rc-function.
    schema : {"ipa", "asjp"}
        Change the basic schema for sequence comparison. When switching to
        "asjp", this means that sequences will be treated as sequences in ASJP
        code, otherwise, they will be treated as sequences written in basic
        IPA.

    Notes
    -----
    This function is the standard way to communicate with the *rcParams*
    dictionary, which is not imported by default. If you want to see which
    parameters there are, you can load the rcParams dictionary directly::

        >>> from lingpy.settings import rcParams

    However, be careful when changing the values, as they might produce
    unexpected behavior.

    Examples
    --------
    Import LingPy:

    >>> from lingpy import *

    Switch from IPA transcriptions to ASJP transcriptions:

    >>> rc(schema="asjp")

    You can check which "basic orthography" is currently loaded:

    >>> rc('basic_orthography')
    'asjp'
    >>> rc(schema='ipa')
    >>> rc('basic_orthography')
    'fuzzy'

    """
    from lingpy import log

    if rval:
        return rcParams[rval]

    for key in keywords:
        if key == "schema":
            if keywords[key] in ["qlc", 'ipa']:
                diacritics, vowels, tones = load_dvt(path='')
                rcParams['asjp'] = Model('asjp')
                rcParams['sca'] = Model('sca')
                rcParams['dolgo'] = Model('dolgo')
                rcParams['art'] = Model('art')
                rcParams['diacritics'] = diacritics
                rcParams['vowels'] = vowels
                rcParams['tones'] = tones
                rcParams['_color'] = Model('color')
                rcParams['combiners'] = '\u0361\u035c'
                rcParams['breaks'] = '.-'
                rcParams['stress'] = "ˈˌ'"
                rcParams['merge_vowels'] = True
                rcParams['basic_orthography'] = 'fuzzy'

                # reset basic model to sca
                rcParams['model'] = rcParams['sca']

            elif keywords[key] in ['evolaemp', 'el', 'asjp']:
                diacritics, vowels, tones = load_dvt(path='el')
                rcParams['asjp'] = Model('asjp_el')
                rcParams['sca'] = Model('sca_el')
                rcParams['dolgo'] = Model('dolgo_el')
                rcParams['art'] = Model('art_el')
                rcParams['jaeger'] = Model('jaeger_el')
                rcParams['diacritics'] = diacritics
                rcParams['vowels'] = vowels
                rcParams['tones'] = tones
                rcParams['_color'] = Model('color_el')
                rcParams['combiners'] = '\u0361\u035c'
                rcParams['breaks'] = '.-'
                rcParams['stress'] = "ˈˌ'"
                rcParams['merge_vowels'] = False
                rcParams['basic_orthography'] = 'asjp'

                # reset the basic model to the asjp model
                rcParams['model'] = rcParams['asjp']

        if key in alias:
            rcParams[alias[key]] = keywords[key]
        else:
            rcParams[key] = keywords[key]
    log.info("Successfully changed parameters.")
Example No. 26
0
def find_threshold(matrix, thresholds=[i * 0.05 for i in range(1, 19)][::-1], logs=True):
    """
    Use a variant of the method by :evobib:`Apeltsin2011` in order to find an optimal
    threshold.

    Parameters
    ----------
    matrix : list
        The distance matrix for which the threshold shall be determined.
    thresholds : list (default=[i * 0.05 for i in range(1, 19)][::-1])
        The range of thresholds that shall be tested.
    logs : {bool, function} (default=True)
        If set to **True**, the logarithm of the score beyond the threshold
        will be assigned as weight to the graph. If set to **False**, all
        weights will be set to 1. Use a custom function to define individual
        ways to calculate the weights.

    Returns
    -------
    threshold : {float,None}
        If a float is returned, this is the threshold identified by the method.
        If **None** is returned, no threshold could be identified.

    Notes
    -----
    This is a very simple method that may not work well on all datasets, so
    we recommend using it with great care.
    """

    # get the initial average degree of the network at threshold 1.0
    odeg = _get_wad(matrix, 1)

    # store the plateaus (where nothing changes in the network)
    plato = {0: [1.0]}

    # this is the current index of the last plateau
    ci = 0
    minc = 0
    alls = []

    # start iterating and calculating
    for i, t in enumerate(thresholds[1:], 1):
        # get the new degree of the matrix under threshold t
        ndeg = _get_wad(matrix, t, logs)

        # if there is a new degree
        if ndeg:
            # get the change in comparison with the old degree
            cdeg = ndeg - odeg

            if cdeg < minc:
                minc = cdeg

            # swap old degree to new degree
            odeg = ndeg

            # within a plateau the degree does not drop, so the change must
            # be greater than or equal to zero; otherwise a new plateau starts
            if cdeg >= 0:
                plato[ci] += [t]
            else:
                plato[i] = [t]
                ci = i

            alls += [(t, ndeg)]

    # try to find the plateau of maximal length
    sorted_plato = sorted(plato, key=lambda x: len(plato[x]), reverse=True)
    log.info('Found {0} thresholds.'.format(len([p for p in plato if len(plato[p]) > 1])))
    log.info('... %s' % (sorted([len(plato[p]) for p in plato], reverse=True),))
    # return the mean threshold of the longest plateau that spans more than
    # one threshold; if no such plateau exists, no threshold can be found
    try:
        return [sum(plato[t]) / len(plato[t])
                for t in sorted_plato if len(plato[t]) > 1][0]
    except IndexError:
        return None
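
# Usage sketch (hedged): run the threshold search on a small, made-up
# distance matrix; real matrices would come from pairwise distance
# calculations over a wordlist.
matrix = [
    [0.0, 0.2, 0.8],
    [0.2, 0.0, 0.9],
    [0.8, 0.9, 0.0],
]
best_t = find_threshold(matrix)
if best_t is None:
    print('no threshold could be identified')
else:
    print('threshold: {0:.2f}'.format(best_t))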
Example No. 27
0
def tstats(
    wordlist,
    glm='',
    network=False,
    acs=False,
    tree=False,
    singletons=True,
    return_dists=False
):
    """
    Calculate transmission statistics for a given MLN.
    """

    # a gain-loss model or a network is required
    if not glm and not network:
        raise ValueError("You must specify either a network or a gain-loss model.")

    # check for acs and network
    if glm:
        # network = wordlist.graph[glm]
        acs = wordlist.acs[glm]

    # check for tree
    if not tree:
        tree = wordlist.tree

    # add the distributions of the leaves to the acs
    for t in tree.taxa:
        paps = wordlist.get_list(taxa=t, entry='pap', flat=True)
        cons = wordlist.get_list(taxa=t, entry='concept', flat=True)

        acs[t] = [(p, c) for p, c in zip(paps, cons)]

    # now we apply a simple way to resolve directions by taking the first
    # occurrence of links in the tree to be the innovation, and all dependent
    # links to be the source of borrowings

    # create a queue
    queue = ['root']

    # make a dictionary of innovated chars: all chars present at the root
    # count as innovations; entries are ordered as [inheritance, innovation,
    # transfer]
    tracer = {c[0]: [0, 1, 0] for c in acs['root']}
    states = {}

    # start to iterate
    while queue:

        # get current node
        node = queue.pop(0)

        # get the children
        children = tree.getNodeMatchingName(node).Children

        # get the chars of the node
        node_chars = list(set([c[0] for c in acs[node]]))

        # if there are children
        for child in children:

            # get the node name
            name = child.Name

            # append name to the queue
            queue += [name]

            # get the chars of the child
            chars = list(set([c[0] for c in acs[name]]))

            inn = 0
            ret = 0
            bor = 0
            # iterate over chars and decide where they come from
            for char in chars:

                if char not in wordlist.singletons or not singletons:
                    # if the char is present in the parent node, it is inherited
                    if char in node_chars:
                        tracer[char][0] += 1
                        ret += 1

                    # if the char occurs for the first time, it is an innovation
                    elif char not in tracer:
                        tracer[char] = [0, 1, 0]
                        inn += 1

                    # if the char is known but absent from the parent node, it
                    # was transferred (borrowed)
                    elif char not in node_chars and char in tracer:
                        tracer[char][2] += 1
                        bor += 1

            states[name] = [ret, inn, bor]
            print(name, inn, ret, bor, str(child))

    # calculate the scores
    ret = sum([c[0] for c in tracer.values()])
    inn = sum([c[1] for c in tracer.values()])
    tra = sum([c[2] for c in tracer.values()])

    ipn = inn / len(acs)
    tpn = tra / len(acs)

    total2 = ipn + tpn

    log.info("Innovations: {0}, {1:.2f}, {2:.2f}".format(inn, ipn, ipn / total2))
    log.info("Transferred: {0}, {1:.2f}, {2:.2f}".format(tra, tpn, tpn / total2))

    if return_dists:
        leaves = []
        nodes = []
        for node in [n for n in tree.getNodeNames() if n != 'root']:
            innovations = states[node][1] + states[node][2]
            if node in tree.taxa:
                leaves += [innovations]
            else:
                nodes += [innovations]

        # compare innovation counts of leaves and inner nodes with a
        # Kruskal-Wallis test (H statistic and p-value)
        h, p = sps.mstats.kruskalwallis(leaves, nodes)

        return h, p

    return inn, tra, tracer
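
# Usage sketch (hedged): tstats expects a wordlist enriched by LingPy's
# borrowing-detection workflow, in which wordlist.acs[glm] holds the
# ancestral character states of a gain-loss model; the model name 'w-1-1'
# is a hypothetical example.
inn, tra, tracer = tstats(wordlist, glm='w-1-1')
print('innovations: {0}, transfers: {1}'.format(inn, tra))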
Example No. 28
0
def calculate_data(wordlist,
                   data,
                   taxa='taxa',
                   concepts='concepts',
                   ref='cogid',
                   **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods


    """
    logger = log.get_logger()
    util.setdefaults(keywords,
                     distances=False,
                     tree_calc="upgma",
                     cluster="upgma",
                     force=False,
                     threshold=0.5,
                     cluster_method='upgma')

    # get taxa for the current calculation
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(wordlist, taxa, concepts, ref,
                                             **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning("Reference tree has already been calculated, "
                           "force overwrite by "
                           "setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])

    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning("Groups have already been calculated, "
                           "force overwrite by "
                           "setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
Example No. 29
0
def calculate_data(
        wordlist,
        data,
        taxa='taxa',
        concepts='concepts',
        ref='cogid',
        **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods


    """
    logger = log.get_logger()
    util.setdefaults(
        keywords,
        distances=False,
        tree_calc="upgma",
        cluster="upgma",
        force=False,
        threshold=0.5,
        cluster_method='upgma')

    # get taxa for the current calculation
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(
                wordlist, taxa, concepts, ref, **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning(
                    "Reference tree has already been calculated, "
                    "force overwrite by "
                    "setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])

    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning(
                    "Groups have already been calculated, "
                    "force overwrite by "
                    "setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))