Example #1
def check_strict_cognates(
        wordlist,
        ref='crossids',
        segments='tokens'
        ):
    """Check if cognates are really strict."""
    fails, errors = [], 0
    etd = wordlist.get_etymdict(ref=ref)
    for cogid in etd:
        for vals in etd[cogid]:
            if vals:
                if not str(cogid).isdigit():
                    fails += vals
                else:
                    alms = []
                    for idx in vals:
                        try:
                            tokens = wordlist[idx, segments].n[wordlist[idx,
                                ref].index(cogid)]
                            alms += [(idx, tokens)]
                        except:
                            fails += [idx]
                    for idx, tokens in alms[1:]:
                        if str(tokens) != str(alms[0][1]):
                            fails += [idx]
                            errors += 1
                            log.warning('{0} | {1} | {2} | {3:15} | {4:15}'.format(
                                        errors, idx, alms[0][0],
                                        str(tokens), str(alms[0][1])))
    return fails
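
A minimal usage sketch for the example above, assuming a LingPy wordlist that carries the 'crossids' and 'tokens' columns (the file name is a placeholder):

# Hypothetical usage; 'partial-cognates.tsv' is a placeholder file name.
from lingpy import Wordlist

wl = Wordlist('partial-cognates.tsv')
bad_rows = check_strict_cognates(wl, ref='crossids', segments='tokens')
print('{0} rows failed the strict-cognate check'.format(len(bad_rows)))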
Example #2
def check_strict_cognates(wordlist, ref='crossids', segments='tokens'):
    """Check if cognates are really strict."""
    fails, errors = [], 0
    etd = wordlist.get_etymdict(ref=ref)
    for cogid in etd:
        for vals in etd[cogid]:
            if vals:
                if not str(cogid).isdigit():
                    fails += vals
                else:
                    alms = []
                    for idx in vals:
                        try:
                            tokens = wordlist[idx, segments].n[wordlist[
                                idx, ref].index(cogid)]
                            alms += [(idx, tokens)]
                        except:
                            fails += [idx]
                    for idx, tokens in alms[1:]:
                        if str(tokens) != str(alms[0][1]):
                            fails += [idx]
                            errors += 1
                            log.warning(
                                '{0} | {1} | {2} | {3:15} | {4:15}'.format(
                                    errors, idx, alms[0][0], str(tokens),
                                    str(alms[0][1])))
    return fails
Example #3
def check_sequence_length(
        wordlist, 
        entities=['tokens', 'crossids', 'morphemes', 'structure'],
        dimensions=[2, 1, 2, 1]
        ):
    """Function checks for identical sequence length in different columns.
    """
    fails, errors = [], 0
    for (eA, dA), (eB, dB) in combinations(zip(entities, dimensions), r=2):
        for idx in wordlist:
            if not check_length(
                    wordlist[idx, eA], 
                    wordlist[idx, eB],
                    dA,
                    dB
                    ):
                errors += 1
                log.warning(
                        '{0} | {1} | {2} | {3} | {4} | {5}'.format(
                            errors,
                            idx,
                            eA,
                            eB,
                            wordlist[idx, eA],
                            wordlist[idx, eB]
                            )
                        )
                fails += [(idx, eA, eB)]
    return fails
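
A hedged invocation sketch, reusing the wordlist from the sketch under Example #1 (the helper check_length and the log module are defined elsewhere in the source module):

# Hypothetical call with the default column/dimension pairing shown above.
mismatches = check_sequence_length(
    wl,
    entities=['tokens', 'crossids', 'morphemes', 'structure'],
    dimensions=[2, 1, 2, 1])
if mismatches:
    print('length mismatches found in {0} comparisons'.format(len(mismatches)))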
Example #4
def test_convenience():
    info('m')
    warning('m')
    debug('m')
    error('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
Example #5
def check_cognates(wordlist, ref='crossids'):
    """Function checks for internal consistency of partial cognates."""
    fails = []
    for idx, cogids in wordlist.iter_rows(ref):
        if len(set(cogids)) != len(cogids):
            log.warning('duplicates in {0}'.format(cogids))
            fails += [idx]
    return fails
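
The core test is nothing more than a set-versus-list length comparison; a standalone illustration:

# Duplicate detection on a single row of partial cognate IDs.
cogids = [17, 23, 17]
print(len(set(cogids)) != len(cogids))  # True: cognate ID 17 is assigned twice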
Example #6
def check_cognates(wordlist, ref='crossids'):
    """Function checks for internal consistency of partial cognates."""
    fails = []
    for idx, cogids in wordlist.iter_rows(ref):
        if len(set(cogids)) != len(cogids):
            log.warning('duplicates in {0}'.format(cogids))
            fails += [idx]
    return fails
Example #7
def test_convenience():
    info('m')
    warning('m')
    debug('m')
    error('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
Example #8
def _get_brackets(brackets):

    out = defaultdict(str)
    for b in brackets:
        out[b] = unicodedata.lookup(unicodedata.name(b).replace('LEFT', 'RIGHT'))
        if b == out[b]:
            log.warning('lingpy.sequence.sound_classes.get_brackets: '
                        'Item «{0}» does not have a counterpart!'.format(b))
    return out
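
The LEFT-to-RIGHT name substitution can be verified directly with the standard library:

import unicodedata

# 'LEFT PARENTHESIS' -> 'RIGHT PARENTHESIS' -> ')'
print(unicodedata.lookup(unicodedata.name('(').replace('LEFT', 'RIGHT')))
# A character whose Unicode name contains no 'LEFT' maps back onto itself
# and would trigger the warning above.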
Example #9
def _get_brackets(brackets):

    out = defaultdict(str)
    for b in brackets:
        out[b] = unicodedata.lookup(unicodedata.name(b).replace('LEFT', 'RIGHT'))
        if b == out[b]:
            log.warning('lingpy.sequence.sound_classes.get_brackets: '
                        'Item «{0}» does not have a counterpart!'.format(b))
    return out
Example #10
def string2html(
    taxon,
    string,
    swaps=[],
    tax_len=None
    ):
    """
    Function converts an (aligned) string into colored html-format.
    
    @deprecated
    """

    # determine the length of the string
    if not tax_len:
        tax_len = len(taxon)

    # set the tr-line
    tr = '<tr class="msa">\n{0}\n</tr>'

    # set the td_taxon-line
    td_taxon = '<td class="taxon" width="' + str(15 * tax_len) + '">{0}</td>\n'

    # get the percentage scaling factor
    perc = int(80 / len(string) + 0.5)

    # get vals for residue and swaps
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50" ' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = ''
    out += td_taxon.format(taxon)

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except:
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warning("Unknown character '" + char + "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'

        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)

    return out
Example #11
def string2html(taxon, string, swaps=[], tax_len=None):
    """
    Function converts an (aligned) string into colored html-format.
    
    @deprecated
    """

    # determine the length of the string
    if not tax_len:
        tax_len = len(taxon)

    # set the tr-line
    tr = '<tr class="msa">\n{0}\n</tr>'

    # set the td_taxon-line
    td_taxon = '<td class="taxon" width="' + str(15 * tax_len) + '">{0}</td>\n'

    # get the percentage scaling factor
    perc = int(80 / len(string) + 0.5)

    # get vals for residue and swaps
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50" ' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = ''
    out += td_taxon.format(taxon)

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except:
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warning("Unknown character '" + char +
                            "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'

        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)

    return out
Example #12
def tokens2html(
    string,
    swaps=[],
    tax_len=None,
):
    """
    Function converts an (aligned) string into colored html-format.

    Notes
    -----
    This function is currently not used by any other program. So it might be
    useful to just deprecate it.

    @deprecated
    """
    # set the tr-line
    tr = '<tr class="msa">\n{0}\n</tr>'

    # get the percentage scaling factor
    perc = int(80 / len(string) + 0.5)

    # get vals for residue and swaps
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50" ' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = '<table>'

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except:
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warning("Unknown character '" + char +
                            "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'

        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)

    return out + '</table>'
Example #13
def tokens2html(
    string,
    swaps=[],
    tax_len=None,
):
    """
    Function converts an (aligned) string into colored html-format.

    Notes
    -----
    This function is currently not used by any other program. So it might be
    useful to just deprecate it.

    @deprecated
    """
    # set the tr-line
    tr = '<tr class="msa">\n{0}\n</tr>'

    # get the percentage scaling factor
    perc = int(80 / len(string) + 0.5)

    # get vals for residue and swaps
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50" ' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = '<table>'

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except:
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warning("Unknown character '" + char + "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'

        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)

    return out + '</table>'
Example #14
def check_sequence_length(
        wordlist,
        entities=['tokens', 'crossids', 'morphemes', 'structure'],
        dimensions=[2, 1, 2, 1]):
    """Function checks for identical sequence length in different columns.
    """
    fails, errors = [], 0
    for (eA, dA), (eB, dB) in combinations(zip(entities, dimensions), r=2):
        for idx in wordlist:
            if not check_length(wordlist[idx, eA], wordlist[idx, eB], dA, dB):
                errors += 1
                log.warning('{0} | {1} | {2} | {3} | {4} | {5}'.format(
                    errors, idx, eA, eB, wordlist[idx, eA], wordlist[idx, eB]))
                fails += [idx]
    return fails
Example #15
def npoint_ap(scores, cognates, reverse=False):
    """
    Calculate the n-point average precision.
    
    Parameters
    ----------
    scores : list
        The scores of your algorithm for pairwise string comparison. 
    cognates : list
        The cognate codings of the word pairs you compared. 1 indicates that
        the pair is cognate, 0 indicates that it is not cognate.
    reverse : bool (default=False)
        The order of your ranking mechanism. If your algorithm yields high
        scores for words which are probably cognate, and low scores for
        non-cognate words, you should set this keyword to "True".

    Notes
    -----
    This follows the description in :evobib:`Kondrak2002`. The n-point average
    precision is useful to compare the discriminative force of different
    algorithms for string similarity, or to train the parameters of a given
    algorithm.

    Examples
    --------
    
    >>> scores = [1, 2, 3, 4, 5]
    >>> cognates = [1, 1, 1, 0, 0]
    >>> from lingpy.evaluate.acd import npoint_ap
    >>> npoint_ap(scores, cognates)
    1.0

    """
    p = 0.0
    cognate_count = 0
    for k, (score, cognate) in enumerate(
            sorted(zip(scores, cognates), key=lambda x: x[0],
                   reverse=reverse)):
        if cognate == 1:
            cognate_count += 1
            p += cognate_count / (k + 1.0)
    try:
        return p / cognates.count(1)
    except ZeroDivisionError:
        log.warning(
            "Encountered Zero Division in npoint_ap, your data seems to contain no cognates."
        )
        return 0
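
A second worked example, following the docstring's advice for similarity-style scores (high score means likely cognate, so reverse=True is required):

# Similarity scores: rank in descending order via reverse=True.
scores = [0.9, 0.8, 0.1]
cognates = [1, 1, 0]
print(npoint_ap(scores, cognates, reverse=True))  # 1.0: both cognates ranked first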
Example #16
def wl2multistate(wordlist, ref, missing):
    """
    Function converts a wordlist to multistate format (compatible with PAUP).
    """

    # convert the data to a multistate matrix
    # get etymological dictionary
    wordlist.get_etymdict(ref=ref)

    # define chars, we only have a limited set, unfortunately
    chars = ascii_letters + digits

    # iterate over all cognate sets and assign the chars
    matrix = []
    for c in wordlist.concepts:
        taxon_to_cognate_set = wordlist.get_dict(concept=c, entry=ref)

        distinct_states = set()
        for taxon in wordlist.taxa:
            distinct_states.update(taxon_to_cognate_set.get(taxon, [0]))

        # make converter
        if len(distinct_states) > len(chars):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we
            # will get a KeyError
            # down below, since zip just returns a list of length len(chars)!
            log.warning('more distinct states than available characters!')
        char_map = dict(zip(sorted(distinct_states), chars))
        char_map['-'] = '-'

        line = []
        for taxon in wordlist.taxa:
            states = set(taxon_to_cognate_set.get(taxon, ['-']))
            # exclude the case len(taxon_to_cognate_set[taxon]) == 0
            if len(states) == 1:
                line.append(char_map[states.pop()])
            elif not states:
                line.append(missing)
            else:
                line.append('({0})'.format(
                    "".join([char_map[x] for x in sorted(states)])))

        matrix.append(line)

    return misc.transpose(matrix)
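
The per-concept state-to-character mapping boils down to zipping the sorted states against the available symbols; a standalone sketch of that conversion step (state IDs are made up):

from string import ascii_letters, digits

distinct_states = {'12', '7', '9'}
char_map = dict(zip(sorted(distinct_states), ascii_letters + digits))
char_map['-'] = '-'
print(char_map)  # {'12': 'a', '7': 'b', '9': 'c', '-': '-'}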
Example #17
def wl2multistate(wordlist, ref, missing):
    """
    Function converts a wordlist to multistate format (compatible with PAUP).
    """

    # convert the data to a multistate matrix
    # get etymological dictionary
    wordlist.get_etymdict(ref=ref)

    # define chars, we only have a limited set, unfortunately
    chars = ascii_letters + digits

    # iterate over all cognate sets and assign the chars
    matrix = []
    for c in wordlist.concepts:
        taxon_to_cognate_set = wordlist.get_dict(concept=c, entry=ref)

        distinct_states = set()
        for taxon in wordlist.taxa:
            distinct_states.update(taxon_to_cognate_set.get(taxon, [0]))

        # make converter
        if len(distinct_states) > len(chars):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we
            # will get a KeyError
            # down below, since zip just returns a list of length len(chars)!
            log.warning('more distinct states than available characters!')
        char_map = dict(zip(sorted(distinct_states), chars))
        char_map['-'] = '-'

        line = []
        for taxon in wordlist.taxa:
            states = set(taxon_to_cognate_set.get(taxon, ['-']))
            # exclude the case len(taxon_to_cognate_set[taxon]) == 0
            if len(states) == 1:
                line.append(char_map[states.pop()])
            elif not states:
                line.append(missing)
            else:
                line.append('({0})'.format("".join(
                    [char_map[x] for x in sorted(states)])))

        matrix.append(line)

    return misc.transpose(matrix)
Example #18
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4]+'.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(
                (
                    '.'.join([k for k in almA if k != '-']),
                    '.'.join([k for k in almB if k != '-'])
                )
            )
            alignments.append(
                (
                    [str(a) for a in almA],
                    [str(b) for b in almB],
                    0)
            )
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1,
            seq_ids[i],
            ids
        )
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
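
A hedged usage sketch (file names are placeholders; the keyword defaults follow the util.setdefaults call above):

# Hypothetical call: render alignments.psa to alignments.html with the bundled templates.
psa2html('alignments.psa', filename='alignments.html', compact=True)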
Example #19
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {k: v[1:-1]
                        for k, v in [key.split('=') for key in tmp.split(' ')[1:]]}
            else:
                dtype = tmp.strip()
                keys = {}

            tmp = []

            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append([x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                if 'id' not in keys:
                    keys['id'] = 'basic'
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) + '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
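
A short usage sketch based on the return format described in the docstring (the file name is a placeholder):

# Hypothetical call: d[0] holds the lower-cased header, integer keys hold the rows.
d = read_qlc('wordlist.qlc')
print(d[0])   # header columns
print(d[1])   # first data row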
Example #20
    def __init__(self, filename, conf=''):
        """
        Parse data regularly if the data has not been loaded from a pickled version.
        """
        self.log = log.get_logger()

        # try to load the data
        internal_import = False

        # check whether it's a dictionary from which we load
        if isinstance(filename, dict):
            input_data = filename
            if 'filename' not in input_data:
                self.filename = rcParams['filename']
            internal_import = True
            # make check for correct input, there was a bug with a wrong
            # evaluation which is hopefully fixed by now
            tmp_keys = [k for k in input_data if isinstance(k, int)]
            if len(input_data[0]) != len(input_data[tmp_keys[0]]):
                log.warning(input_data[0], input_data[tmp_keys[0]])
                raise ValueError("[!] Wrong input format!")  # pragma: no cover
        # check whether it's another wordlist-object
        elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
            input_data = dict([(key, [v for v in value]) for key, value in \
                    filename._data.items()])
            input_data.update(filename._meta.items())
            input_data[0] = [
                a for a, b in sorted(
                    filename.header.items(), key=lambda x: x[1], reverse=False)
            ]
            internal_import = True
            self.filename = rcParams['filename']
        # or whether the data is an actual file
        elif isinstance(filename, string_types) and os.path.isfile(filename):
            input_data = read_qlc(filename)
            self.filename = filename
        # raise an error otherwise
        elif isinstance(filename, string_types):
            raise IOError("Input file '{0}' does not exist.".format(filename))
        else:
            raise TypeError(
                "Unrecognized type for 'filename' argument: {0}".format(
                    type(filename).__name__))

        self._alias, self._class, self._class_string, self._alias2 = read_conf(
            conf)
        for name in input_data[0]:
            if name.lower() not in self._alias:
                self._alias[name.lower()] = name.lower()
                self._class[name.lower()] = str
            if name.upper() not in self._alias:
                self._alias[name.upper()] = name.lower()
                self._class[name.upper()] = str

        # add empty alias for empty strings XXX why was that? I can't remember
        # why this was important XXX
        self._alias[''] = ''

        # the header stores the indices of the data in the original data dictionary
        self.header = dict(
            zip([self._alias[x] for x in input_data[0]],
                range(len(input_data[0]))))

        # now create a specific header which has all aliases
        self._header = {k: v for k, v in self.header.items()}

        # add a sorted header for reference
        self.columns = sorted(self.header, key=lambda x: self.header[x])

        # assign all aliases to the header
        for alias in self._alias:
            try:
                self._header[alias] = self._header[self._alias[alias]]
            except:
                pass

        # assign the data as attribute to the word list class. Note that we
        # need to check for the type here, but since numpy also offers integer
        # types, we don't check for type(x) == int, but instead use the
        # str.isnumeric method, which returns True only if the key's string
        # form is an integer
        self._data = {
            int(k): v
            for k, v in input_data.items() if k != 0 and str(k).isnumeric()
        }
        # check for same length of all columns
        check_errors = ''
        for k, v in self._data.items():
            if len(v) != len(self.header):
                check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                    k, len(v), len(self.header))
        if check_errors:
            raise ValueError(check_errors + '\n' +
                             ', '.join(sorted(self.header)))

        # iterate over self._data and change the values according to the
        # functions (only needed when reading from file)
        if not internal_import:
            heads = sorted(self._header.items(), key=lambda x: x[1])
            for key in self._data:
                check = []
                for head, i in heads:
                    if i not in check:
                        logstring = 'Problem with row {0} in col {1}, expected' + \
                                    ' «{4}» as datatype but received «{3}» ' + \
                                    ' (ROW: {2}, entry {5}).'
                        try:
                            self._data[key][i] = self._class[head](
                                self._data[key][i])
                            check.append(i)
                        except KeyError:
                            log.warning(
                                logstring.format(
                                    key, i,
                                    '|'.join([str(x) for x in self._data[key]
                                              ]), self._data[key][i],
                                    self._class[head], head))
                        except ValueError:
                            log.warning(
                                logstring.format(
                                    key, i,
                                    '|'.join([str(x) for x in self._data[key]
                                              ]), self._data[key][i],
                                    self._class[head], head))

        # create entry attribute of the wordlist
        self.entries = sorted(
            set([b.lower() for a, b in self._alias.items() if b]))

        # assign meta-data
        self._meta = {}
        for key in [k for k in input_data if type(k) != int]:
            self._meta[key] = input_data[key]
Example #21
    def from_cldf(
            cls, 
            path,
            columns=(
                'parameter_id',
                'concept_name',
                'language_id',
                'language_name',
                'value',
                'form',
                'segments',
                'language_glottocode',
                'concept_concepticon_id',
                'language_latitude',
                'language_longitude',
                'cognacy'
                ),
            namespace=(
               ('concept_name', 'concept'),
               ('language_id', 'doculect'),
               ('segments', 'tokens'),
               ('language_glottocode', 'glottolog'),
               ('concept_concepticon_id', 'concepticon'),
               ('language_latitude', 'latitude'),
               ('language_longitude', 'longitude'),
               ('cognacy', 'cognacy'),
               ('cogid_cognateset_id', 'cogid')
               ),
            filter=lambda row: row["form"],
            **kwargs):
        """Load a CLDF dataset.

        Open a CLDF Dataset – with metadata or metadata-free – (only Wordlist
        datasets are supported for now, because other modules don't seem to
        make sense for LingPy) and transform it into this Class. Columns from
        the FormTable are imported in lowercase, columns from LanguageTable,
        ParameterTable and CognateTable are prefixed with `language_`,
        `concept_` and `cogid_` and converted to lowercase.

        Notes
        -----
        CLDF's default column names for wordlists are different from LingPy's,
        so you probably have to use::

        >>> lingpy.Wordlist.from_cldf(
            "Wordlist-metadata.json",
            )

        in order to avoid errors from LingPy not finding required columns.

        Parameters
        ----------
        columns: list or tuple 
          The list of columns to import. (default: all columns)

        filter: function: rowdict → bool
          A condition function for importing only some rows. (default: lambda row: row["form"])

        All other parameters are passed on to the `cls`

        Returns
        -------
        A `cls` object representing the CLDF dataset

        """
        kw = {
                'row': 'concept',
                'col': 'doculect',
                'conf': util.data_path('conf', 'wordlist.rc'),
                }
        kwargs.update(kw)
        
        if isinstance(namespace, tuple):
            namespace = dict(namespace)

        # get the datatypes from configuration as to namespace
        datatypes = read_conf(kwargs['conf'])[1]

        # Load the dataset.
        fname = Path(path)
        if not fname.exists():
            raise FileNotFoundError('{:} does not exist'.format(fname))
        if fname.suffix == '.json':
            dataset = pycldf.dataset.Dataset.from_metadata(fname)
        else:
            dataset = pycldf.dataset.Dataset.from_data(fname)

        if dataset.module == "Wordlist":
            # First, make a list of cognate codes if they are in a separate table.
            cognateset_assignments = {}
            try:
                form_reference = dataset["CognateTable", "formReference"].name
                for row in dataset["CognateTable"].iterdicts():
                    cognateset_assignments[row[form_reference]] = row
            except KeyError:
                # Either there are no cognate codes, or they are in the form
                # table. Both options are fine.
                pass

            f_id = dataset["FormTable", "id"].name

            # Access columns by type, not by name.
            language_column = dataset["FormTable", "languageReference"].name
            parameter_column = dataset["FormTable", "parameterReference"].name

            try:
                l_id = dataset["LanguageTable", "id"].name
                languages = {l[l_id]: l
                            for l in dataset["LanguageTable"].iterdicts()}
            except KeyError:
                l_id = "ID"
                languages = bounce_as_id

            try:
                c_id = dataset["ParameterTable", "id"].name
                concepts = {c[c_id]: c
                            for c in dataset["ParameterTable"].iterdicts()}
            except KeyError:
                c_id = "ID"
                concepts = bounce_as_id


            # create dictionary
            D = {0: columns} # Reserve the header
            for row in dataset["FormTable"].iterdicts():
                # TODO: Improve prefixing behaviour
                s = {"cogid_{:}".format(key).lower(): value
                     for key, value in cognateset_assignments.get(
                             row[f_id], {}).items()}
                s.update(
                    {"language_{:}".format(key).lower(): value
                     for key, value in languages[row[language_column]].items()})
                s.update(
                    {"concept_{:}".format(key).lower(): value
                     for key, value in concepts[row[parameter_column]].items()})
                s.update({k.lower(): v for k, v in row.items()})

                if not filter(s):
                    continue

                # check for numeric ID
                try:
                    idx = int(row[f_id])
                except ValueError:
                    idx = len(D)
                while idx in D:
                    idx += 1

                if not D[0]:
                    columns = list(s.keys())
                    D[0] = [c.lower() for c in columns]

                D[idx] = [
                    datatypes.get(namespace.get(column, ''), lambda x: x)(
                        s.get(column, ''))
                    for column in columns]
            D[0] = [namespace.get(c, c) for c in columns]
            if len(D[0]) != len(set(D[0])):
                log.warning('|'.join(columns))
                log.warning('|'.join(D[0]))
                raise ValueError('name space clashes, cannot parse data')

            # convert to wordlist and return
            return cls(D, **kwargs)
        else:
            # For most LingPy applications, it might be best to see whether we got
            # a Wordlist module.
            raise ValueError("LingPy has no procedures for CLDF {:} data.".format(
                dataset.module))
Example #22
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {
                    k: v[1:-1]
                    for k, v in [key.split('=') for key in tmp.split(' ')[1:]]
                }
            else:
                dtype = tmp.strip()
                keys = {}

            tmp = []

            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                keys.setdefault('id', 'basic')
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:  # pragma: no cover
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) + '\nOriginal error message: ' +
                        str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
Example #23
def pid(almA, almB, mode=2):
    """
    Calculate the Percentage Identity (PID) score for aligned sequence pairs.

    Parameters
    ----------

    almA, almB : string or list
        The aligned sequences which can be either a string or a list.

    mode : { 1, 2, 3, 4, 5 }
        Indicate which of the four possible PID scores described in :evobib:`Raghava2006`
        should be calculated; the fifth possibility is added for linguistic
        purposes:

        1. identical positions / (aligned positions + internal gap positions),

        2. identical positions / aligned positions,

        3. identical positions / shortest sequence, or

        4. identical positions / shortest sequence (including internal gap
           pos.)

        5. identical positions / (aligned positions + 2 * number of gaps)

    Returns
    -------

    score : float
        The PID score of the given alignment as a floating point number between
        0 and 1.

    Notes
    -----

    The PID score is a common measure for the diversity of a given alignment.
    The implementation employed by LingPy follows the description of
    :evobib:`Raghava2006` where four different variants of PID scores are
    distinguished. Essentially, the PID score is based on the comparison of
    identical residue pairs with the total number of residue pairs in a given
    alignment.

    Examples
    --------
    Load an alignment from the test suite.

    >>> from lingpy import *
    >>> pairs = PSA(get_file('test.psa'))

    Extract the alignments of the first aligned sequence pair.

    >>> almA,almB,score = pairs.alignments[0]

    Calculate the PID score of the alignment.

    >>> pid(almA,almB)
    0.44444444444444442

    See also
    --------
    lingpy.compare.Multiple.get_pid

    .. todo:: change debug for ZeroDivisionError

    """

    zipped = zip(almA, almB)
    idn_pos = 0
    int_gps = 0
    aln_pos = 0

    for charA, charB in zipped:
        tmp = [charA, charB].count('-')
        if tmp == 1:
            int_gps += 1
        elif tmp == 0 and charA == charB:
            idn_pos += 1
            aln_pos += 1
        elif tmp == 0:
            aln_pos += 1

    if mode == 2:
        try:
            return idn_pos / (aln_pos + int_gps)
        except ZeroDivisionError:
            log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 1:
        try:
            return idn_pos / aln_pos
        except ZeroDivisionError:
            log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 3:
        srt_seq = min(
            len([i for i in almA if i != '-']), len([i for i in almB if i != '-']))
        try:
            return idn_pos / srt_seq
        except ZeroDivisionError:
            log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 4:
        srt_seq = min(
            len(''.join([i[0] for i in almA]).strip('-')),
            len(''.join([i[0] for i in almB]).strip('-')))
        try:
            return idn_pos / srt_seq
        except ZeroDivisionError:
            log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 5:
        return idn_pos / len(almA)
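
A small worked example over the function above, with two hand-made alignments (no files needed):

almA = ['t', 'o', '-', 'x']
almB = ['t', 'a', 't', 'x']
# identical pairs: t/t and x/x; aligned non-identical: o/a; one internal gap: -/t
print(pid(almA, almB, mode=2))  # 0.5 -> 2 identical / (3 aligned + 1 gap)
print(pid(almA, almB, mode=1))  # 0.666... -> 2 identical / 3 aligned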
Example #24
    def get_partial_scorer(self, **keywords):
        """
        Create a scoring function based on sound correspondences.

        Parameters
        ----------
        method : str (default='shuffle')
            Select between "markov", for automatically generated random
            strings, and "shuffle", for random strings taken directly from the
            data.
        ratio : tuple (default=(3, 2))
            Define the ratio between derived and original score for
            sound-matches.
        vscale : float (default=0.5)
            Define a scaling factor for vowels, in order to decrease their
            score in the calculations.
        runs : int (default=1000)
            Choose the number of random runs that shall be made in order to
            derive the random distribution.
        threshold : float (default=0.7)
            The threshold which is used to select those words that are compared
            in order to derive the attested distribution.
        modes : list (default = [("global",-2,0.5),("local",-1,0.5)])
            The modes which are used in order to derive the distributions from
            pairwise alignments.
        factor : float (default=0.3)
            The scaling factor for sound segments with identical prosodic
            environment.
        force : bool (default=False)
            Force recalculation of existing distribution.
        preprocessing: bool (default=False)
            Select whether SCA-analysis shall be used to derive a preliminary
            set of cognates from which the attested distribution shall be
            derived.
        rands : int (default=1000)
            If "method" is set to "markov", this parameter defines the number
            of strings to produce for the calculation of the random
            distribution.
        limit : int (default=10000)
            If "method" is set to "markov", this parameter defines the limit
            above which no more search for unique strings will be carried out.
        cluster_method : {"upgma", "single", "complete"} (default="upgma")
            Select the method to be used for the calculation of cognates in the
            preprocessing phase, if "preprocessing" is set to True.
        gop : int (default=-2)
            If "preprocessing" is selected, define the gap opening penalty for
            the preprocessing calculation of cognates.
        unattested : {int, float} (default=-5)
            If a pair of sounds is not attested in the data, but expected by
            the alignment algorithm that computes the expected distribution,
            the score would be -infinity. Yet in order to smooth this
            behaviour and to reduce the strictness, we set a default negative
            value which does not necessarily need to be too high, since it may
            well be that we miss a potentially good pairing in the first runs
            of alignment analyses. Use this keyword to adjust this parameter.
        unexpected : {int, float} (default=0.000001)
            If a pair is encountered in a given alignment but not expected
            according to the randomized alignments, the score would not be
            calculable, since we would have to divide by zero. For this
            reason, we set a very small constant, by which the score is
            divided in this case.
            Note that this constant is only relevant in those cases where the
            shuffling procedure was not carried out long enough.

        """
        kw = dict(
            method=rcParams['lexstat_scoring_method'],
            ratio=rcParams['lexstat_ratio'],
            vscale=rcParams['lexstat_vscale'],
            runs=rcParams['lexstat_runs'],
            threshold=rcParams['lexstat_scoring_threshold'],
            modes=rcParams['lexstat_modes'],
            factor=rcParams['align_factor'],
            restricted_chars=rcParams['restricted_chars'],
            force=False,
            preprocessing=False,
            rands=rcParams['lexstat_rands'],
            limit=rcParams['lexstat_limit'],
            cluster_method=rcParams['lexstat_cluster_method'],
            gop=rcParams['align_gop'],
            preprocessing_threshold=rcParams[
                'lexstat_preprocessing_threshold'],
            preprocessing_method=rcParams['lexstat_preprocessing_method'],
            subset=False,
            defaults=False,
            unattested=-5,
            unexpected=0.00001,
            smooth=1)
        kw.update(keywords)
        if kw['defaults']:
            return kw

        # get parameters and store them in string
        params = dict(
            ratio=kw['ratio'],
            vscale=kw['vscale'],
            runs=kw['runs'],
            scoring_threshold=kw['threshold'],
            preprocessing_threshold=kw['preprocessing_threshold'],
            modestring=':'.join('{0}-{1}-{2:.2f}'.format(a, abs(b), c)
                                for a, b, c in kw['modes']),
            factor=kw['factor'],
            restricted_chars=kw['restricted_chars'],
            method=kw['method'],
            preprocessing='{0}:{1}:{2}'.format(kw['preprocessing'],
                                               kw['cluster_method'],
                                               kw['gop']),
            unattested=kw['unattested'],
            unexpected=kw['unexpected'],
            smooth=kw['smooth'])

        parstring = '_'.join([
            '{ratio[0]}:{ratio[1]}', '{vscale:.2f}', '{runs}',
            '{scoring_threshold:.2f}', '{modestring}', '{factor:.2f}',
            '{restricted_chars}', '{method}', '{preprocessing}',
            '{preprocessing_threshold}', '{unexpected:.2f}', '{unattested:.2f}'
        ]).format(**params)

        # check for existing attributes
        if hasattr(self, 'cscorer') and not kw['force']:
            log.warning(
                "An identical scoring function has already been calculated, "
                "force recalculation by setting 'force' to 'True'.")
            return

        # check for attribute
        if hasattr(self, 'params') and not kw['force']:
            if 'cscorer' in self.params:
                if self.params['cscorer'] == params:
                    log.warning(
                        "An identical scoring function has already been "
                        "calculated, force recalculation by setting 'force'"
                        " to 'True'.")
                    return
            else:
                log.warning(
                    "A different scoring function has already been calculated, "
                    "overwriting previous settings.")

        # store parameters
        self.params = {'cscorer': params}
        self._meta['params'] = self.params
        self._stamp += "# Parameters: " + parstring + '\n'

        # get the correspondence distribution
        self._corrdist = self._get_partial_corrdist(**kw)
        # get the random distribution
        self._randist = self._get_partial_randist(**kw)

        # get the average gop
        gop = sum([m[1] for m in kw['modes']]) / len(kw['modes'])

        # create the new scoring matrix
        matrix = [[c for c in line] for line in self.bscorer.matrix]
        char_dict = self.bscorer.chars2int

        for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)):
            for charA, charB in product(
                    list(self.freqs[tA]) + [util.charstring(i + 1)],
                    list(self.freqs[tB]) + [util.charstring(j + 1)]):
                exp = self._randist.get((tA, tB), {}).get((charA, charB),
                                                          False)
                att = self._corrdist.get((tA, tB), {}).get((charA, charB),
                                                           False)
                # in the following we follow the former lexstat protocol
                if att <= kw['smooth'] and i != j:
                    att = False

                if att and exp:
                    score = np.log2((att**2) / (exp**2))
                elif att and not exp:
                    score = np.log2((att**2) / kw['unexpected'])
                elif exp and not att:
                    score = kw['unattested']  # XXX gop ???
                else:  # elif not exp and not att:
                    score = -90  # ???

                # combine the scores
                if rcParams['gap_symbol'] not in charA + charB:
                    sim = self.bscorer[charA, charB]
                else:
                    sim = gop

                # get the real score
                rscore = (kw['ratio'][0] * score + kw['ratio'][1] * sim) \
                    / sum(kw['ratio'])

                try:
                    iA = char_dict[charA]
                    iB = char_dict[charB]

                    # use the vowel scale
                    if charA[4] in self.vowels and charB[4] in self.vowels:
                        matrix[iA][iB] = matrix[iB][iA] = kw['vscale'] * rscore
                    else:
                        matrix[iA][iB] = matrix[iB][iA] = rscore
                except (KeyError, IndexError):
                    # characters missing from the base scorer are skipped
                    pass

        self.cscorer = misc.ScoreDict(self.chars, matrix)
        self._meta['scorer']['cscorer'] = self.cscorer
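
The heart of the scorer above is a log-odds comparison of attested against expected correspondence frequencies, blended with the simple sound-class similarity from the base scorer. The following minimal sketch isolates that arithmetic; the function name combined_score and the example counts are illustrative assumptions, not part of LingPy's API.

import numpy as np

def combined_score(att, exp, sim, ratio=(2, 1), unexpected=1e-5, unattested=-5):
    # att/exp: attested vs. expected counts of a sound correspondence,
    # sim: similarity of the two sounds according to the simple base scorer
    if att and exp:
        score = np.log2((att ** 2) / (exp ** 2))
    elif att:
        score = np.log2((att ** 2) / unexpected)
    elif exp:
        score = unattested
    else:
        score = -90
    # weight the log-odds evidence against the language-independent similarity
    return (ratio[0] * score + ratio[1] * sim) / sum(ratio)

# a correspondence attested 10 times but expected only twice by chance
print(combined_score(10, 2, 5))   # clearly positive: a regular correspondence
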
Exemplo n.º 25
0
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(kw,
                     template=False,
                     css=False,
                     comment='#',
                     filename=infile[:-4] + '.html',
                     compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(('.'.join([k for k in almA if k != '-']),
                          '.'.join([k for k in almB if k != '-'])))
            alignments.append(
                ([str(a) for a in almA], [str(b) for b in almB], 0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i +
                                                                            1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
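
A minimal usage sketch for psa2html, assuming the function is imported from lingpy.convert.html as in recent LingPy versions; the input and output file names are hypothetical.

from lingpy.convert.html import psa2html  # assumed import path

# convert a pairwise-alignment file into a colored HTML table;
# without the filename keyword the output name is derived from the input
psa2html('pairs.psa', filename='pairs.html', compact=True)
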
Exemplo n.º 26
0
def tokens2morphemes(tokens, **keywords):
    """
    Split a string into morphemes if it contains separators.

    Notes
    -----
    The function splits a list of tokens into sublists of morphemes if the list
    contains morpheme separators. If no separators are found but tone markers
    are present, it will still split the sequence at the tones. To avoid this
    behavior, set the keyword **split_on_tones** to False.

    Parameters
    ----------
    sep : str (default="◦")
        Select your morpheme separator.
    word_sep: str (default="_")
        Select your word separator.

    Returns
    -------
    morphemes : list
        A nested list of the original segments split into morphemes.
    """

    if not isinstance(tokens, (list, tuple)):
        raise ValueError("The sequence needs to be a list or a tuple.")

    kw = {
        "sep": rcParams['morpheme_separator'],
        "word_sep": rcParams['word_separator'],
        "word_seps": rcParams['word_separators'],
        "seps": rcParams['morpheme_separators'],
        "split_on_tones": True,
        "tone": "T",
        "cldf": False
    }
    kw.update(keywords)
    if not kw['split_on_tones']:
        kw['tone'] = ''

    # check for other hints than the clean separators in the data
    new_tokens = [t for t in tokens]
    if not kw['sep'] in tokens and not kw['word_sep'] in tokens:
        class_string = tokens2class(tokens, 'cv', cldf=kw['cldf'])
        if kw['tone'] in class_string \
                and '+' not in class_string and '_' not in class_string:
            new_tokens = []
            for i, token in enumerate(tokens):
                if class_string[i] == kw['tone'] and i != len(class_string) - 1:
                    new_tokens += [token, kw['sep']]
                else:
                    new_tokens += [token]
    out = [[]]
    for i, token in enumerate(new_tokens):
        if token not in kw['sep'] + kw['word_sep'] + kw['word_seps'] + kw['seps']:
            out[-1] += [token]
        else:
            out += [[]]
    # check for bad examples
    if any(not x for x in out):
        log.warning("[!] Your data contains empty morpheme segments.")
        out = [x for x in out if x]

    return out
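
A small usage sketch, assuming tokens2morphemes is importable from lingpy.sequence.sound_classes; the token list and the expected result are illustrative.

from lingpy.sequence.sound_classes import tokens2morphemes  # assumed import path

# "◦" is the documented default morpheme separator
print(tokens2morphemes(['t', 'a', '◦', 'k', 'o']))
# expected (roughly): [['t', 'a'], ['k', 'o']]
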
Exemplo n.º 27
0
    def __init__(self, filename, conf=''):
        """
        Parse data regularly if the data has not been loaded from a pickled version.
        """
        self.log = log.get_logger()

        # try to load the data
        internal_import = False

        # check whether it's a dictionary from which we load
        if isinstance(filename, dict):
            input_data = filename
            if 'filename' not in input_data:
                self.filename = rcParams['filename']
            internal_import = True
            # sanity check for correct input: the header row must have the
            # same number of fields as the data rows (guards against an
            # earlier evaluation bug)
            tmp_keys = [k for k in input_data if isinstance(k, int)]
            if len(input_data[0]) != len(input_data[tmp_keys[0]]):
                print(input_data[0], input_data[tmp_keys[0]])
                raise ValueError("[!] Wrong input format!")  # pragma: no cover
        # check whether it's another wordlist-object
        elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
            input_data = dict([(key, [v for v in value]) for key, value in \
                    filename._data.items()])
            input_data.update(filename._meta.items())
            input_data[0] = [a for a, b in sorted(
                filename.header.items(),
                key=lambda x: x[1],
                reverse=False)]
            internal_import = True
            self.filename = rcParams['filename']
        # or whether the data is an actual file
        elif isinstance(filename, string_types) and os.path.isfile(filename):
            input_data = read_qlc(filename)
            self.filename = filename
        # raise an error otherwise
        elif isinstance(filename, string_types):
            raise IOError("Input file '{0}' does not exist.".format(filename))
        else:
            raise TypeError("Unrecognized type for 'filename' argument: {0}".format(
                type(filename).__name__))

        # load the configuration file
        if not conf:
            conf = util.data_path('conf', 'qlc.rc')

        # read the file defined by its path in conf
        tmp = [line.split('\t') for line in util.read_config_file(conf)]

        # define two attributes, _alias, and _class which store the aliases and
        # the datatypes (classes) of the given entries
        self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
        for name, cls, alias in tmp:
            # make sure the name itself is there
            self._alias[name.lower()] = self._alias[name.upper()] = name
            self._class[name.lower()] = self._class[name.upper()] = eval(cls)
            self._class_string[name.lower()] = self._class_string[name.upper()] = cls

            # add the aliases
            for a in alias.split(','):
                self._alias[a.lower()] = self._alias[a.upper()] = name
                self._class[a.lower()] = self._class[a.upper()] = eval(cls)
                self._class_string[a.lower()] = self._class_string[a.upper()] = cls

            self._alias2[name] = sorted(set(alias.split(','))) + [name]

        # append the names in data[0] to self.conf to make sure that all data
        # is covered, even the types which are not specifically defined in the
        # conf file. the datatype defaults here to "str"
        for name in input_data[0]:
            if name.lower() not in self._alias:
                self._alias[name.lower()] = name.lower()
                self._class[name.lower()] = str
            if name.upper() not in self._alias:
                self._alias[name.upper()] = name.lower()
                self._class[name.upper()] = str

        # add empty alias for empty strings XXX why was that? I can't remember
        # why this was important XXX
        self._alias[''] = ''

        # the header stores the indices of the data in the original data dictionary
        self.header = dict(
            zip([self._alias[x] for x in input_data[0]], range(len(input_data[0]))))

        # now create a specific header which has all aliases
        self._header = {k: v for k, v in self.header.items()}

        # add a sorted header for reference
        self.columns = sorted(self.header, key=lambda x: self.header[x])

        # assign all aliases to the header
        for alias in self._alias:
            try:
                self._header[alias] = self._header[self._alias[alias]]
            except KeyError:
                # aliases without a corresponding column are skipped
                pass

        # assign the data as attribute to the wordlist class. Note that we
        # need to check the type of the keys here, but since numpy offers its
        # own integer types, we don't test type(x) == int and instead rely on
        # str.isnumeric(), which only accepts integer-like keys
        self._data = {
            int(k): v for k, v in input_data.items() if k != 0 and str(k).isnumeric()}
        # check for same length of all columns
        check_errors = ''
        for k, v in self._data.items():
            if len(v) != len(self.header):
                check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                    k, len(v), len(self.header))
        if check_errors:
            raise ValueError(check_errors + '\n' + ', '.join(sorted(self.header)))

        # iterate over self._data and change the values according to the
        # functions (only needed when reading from file)
        if not internal_import:
            heads = sorted(self._header.items(), key=lambda x: x[1])
            for key in self._data:
                check = []
                for head, i in heads:
                    if i not in check:
                        logstring = 'Problem with row {0} in col {1}, expected' + \
                                    ' «{4}» as datatype but received «{3}»' + \
                                    ' (ROW: {2}, entry {5}).'
                        try:
                            self._data[key][i] = self._class[head](self._data[key][i])
                            check.append(i)
                        except (KeyError, ValueError):
                            log.warning(
                                logstring.format(
                                    key,
                                    i,
                                    '|'.join([str(x) for x in self._data[key]]),
                                    self._data[key][i],
                                    self._class[head],
                                    head))

        # create entry attribute of the wordlist
        self.entries = sorted(set([b.lower() for a, b in self._alias.items() if b]))

        # assign meta-data
        self._meta = {}
        for key in [k for k in input_data if type(k) != int]:
            self._meta[key] = input_data[key]
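
The parser above maps every configured column name and alias (in both lower and upper case) to a canonical name and a datatype, then builds the header from the first row of the input. The following self-contained sketch reproduces that mapping with hypothetical configuration rows, using a lookup table instead of eval for the datatypes.

# hypothetical configuration rows: (name, class, comma-separated aliases)
conf_rows = [
    ('ipa', 'str', 'phonetic,transcription'),
    ('cogid', 'int', 'cognateid,cogids'),
]

alias, cls = {}, {}
for name, typ, aliases in conf_rows:
    for key in [name] + aliases.split(','):
        alias[key.lower()] = alias[key.upper()] = name
        cls[key.lower()] = cls[key.upper()] = {'str': str, 'int': int}[typ]

columns = ['IPA', 'COGNATEID']          # first row of a hypothetical input file
header = {alias[c]: i for i, c in enumerate(columns)}
print(header)                           # {'ipa': 0, 'cogid': 1}
print(cls['cognateid']('12') + 1)       # 13: the cell value is coerced to int
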
Exemplo n.º 29
0
def plot_heatmap(wordlist,
                 filename="heatmap",
                 fileformat="pdf",
                 ref='cogid',
                 normalized=False,
                 refB='',
                 **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to c{False}, don't normalize the data. Otherwise, select the
        normalization method, choosing between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=5)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=45)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,10))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.

    Notes
    -----
    This function plots shared cognate percentages.

    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False)
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False."
            )

    # create an empty matrix
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes([
            keywords['left'], keywords['bottom'], 0.25 * keywords['width'],
            keywords['height']
        ])
        # [0.01,0.1,0.2,0.7])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left',
        )
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        ax1.spines['bottom'].set_color('#ffffff')
        ax1.spines['top'].set_color('#ffffff')
        ax1.spines['left'].set_color('#ffffff')
        ax1.spines['right'].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']

    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if keywords['matrix']:
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(taxa=taxonA,
                                                  flat=True,
                                                  entry=ref)
                        cogsB = wordlist.get_list(taxa=taxonB,
                                                  flat=True,
                                                  entry=ref)

                        cogsA, cogsB = set(cogsA), set(cogsB)

                        shared = len(cogsA.intersection(cogsB))

                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(taxa=taxonA, entry=ref)
                        cogsB = wordlist.get_dict(taxa=taxonB, entry=ref)

                        shared = 0
                        slots = 0

                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:

                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warning(
                                str([
                                    shared, slots,
                                    len(cogsA),
                                    len(cogsB), taxonA, taxonB
                                ]))
                            shared = 0.0

                    matrix[i][j] = shared

                    # if refB is also a possibility
                    if not refB:
                        matrix[j][i] = shared

                elif i > j and refB:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(taxa=taxonA,
                                                  flat=True,
                                                  entry=refB)
                        cogsB = wordlist.get_list(taxa=taxonB,
                                                  flat=True,
                                                  entry=refB)

                        cogsA, cogsB = set(cogsA), set(cogsB)

                        shared = len(cogsA.intersection(cogsB))

                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(taxa=taxonA, entry=refB)
                        cogsB = wordlist.get_dict(taxa=taxonB, entry=refB)

                        shared = 0
                        slots = 0

                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:

                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warning(
                                str([
                                    shared, slots,
                                    len(cogsA),
                                    len(cogsB), taxonA, taxonB
                                ]))
                            shared = 0.0

                    matrix[i][j] = shared

                elif i == j:
                    cogs = wordlist.get_list(taxa=taxonA, flat=True, entry=ref)
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        matrix[i][j] = len(set(cogs))
    ax2 = fig.add_axes([
        left,  # keywords['left']+0.25 * keywords['width']+0.05,
        keywords['bottom'],
        keywords['width'],
        keywords['height']
    ])
    cmap = keywords['cmap']

    # [0.15,0.1,0.7,0.7])
    if 'distances' in keywords and keywords['distances']:
        for i, line in enumerate(matrix):
            for j, cell in enumerate(line):
                matrix[i][j] = 1 - matrix[i][j]
    nmatrix = [[keywords['vmax'], keywords['vmin']],
               [keywords['vmin'], keywords['vmax']]]

    im = ax2.matshow(nmatrix,
                     aspect='auto',
                     origin='lower',
                     interpolation='nearest',
                     cmap=keywords['cmap'],
                     vmax=keywords['vmax'],
                     vmin=keywords['vmin'])

    # set the xticks
    steps = int(len(taxa) / keywords['steps'] + 0.5)
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(idxs,
               selected_taxa,
               size=keywords['textsize'],
               rotation=keywords['xrotation'],
               rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:

        plt.imshow(matrix,
                   cmap=keywords['cmap'],
                   visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(left=keywords['left'],
                        right=keywords['right'],
                        top=keywords['top'],
                        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    f = open(filename + '.matrix', 'w')
    for i, t in enumerate(taxa):
        f.write('{0:20}'.format(t))
        for j, c in enumerate(matrix[i]):
            if not normalized:
                f.write('\t{0:3}'.format(int(c)))
            else:
                f.write('\t{0:.2f}'.format(c))
        f.write('\n')
    f.close()
    log.file_written(filename + '.' + fileformat)
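
A usage sketch for plot_heatmap; the import path, the input file name, and the tree calculation are assumptions based on common LingPy workflows.

from lingpy import Wordlist
from lingpy.convert.plot import plot_heatmap  # assumed import path

wl = Wordlist('wordlist-with-cogids.tsv')     # hypothetical input file
wl.calculate('tree')                          # make a reference tree available
plot_heatmap(wl, filename='shared-cognates', fileformat='pdf',
             ref='cogid', normalized='jaccard')
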
Exemplo n.º 30
0
def pid(almA, almB, mode=2):
    """
    Calculate the Percentage Identity (PID) score for aligned sequence pairs.

    Parameters
    ----------

    almA, almB : string or list
        The aligned sequences which can be either a string or a list.

    mode : { 1, 2, 3, 4, 5 }
        Indicate which of the four possible PID scores described in :evobib:`Raghava2006`
        should be calculated; the fifth possibility is added for linguistic
        purposes:

        1. identical positions / (aligned positions + internal gap positions),

        2. identical positions / aligned positions,

        3. identical positions / shortest sequence, or

        4. identical positions / shortest sequence (including internal gap
           pos.)

        5. identical positions / (aligned positions + 2 * number of gaps)

    Returns
    -------

    score : float
        The PID score of the given alignment as a floating point number between
        0 and 1.

    Notes
    -----

    The PID score is a common measure for the diversity of a given alignment.
    The implementation employed by LingPy follows the description of
    :evobib:`Raghava2006` where four different variants of PID scores are
    distinguished. Essentially, the PID score is based on the comparison of
    identical residue pairs with the total number of residue pairs in a given
    alignment.

    Examples
    --------
    Load an alignment from the test suite.

    >>> from lingpy import *
    >>> pairs = PSA(get_file('test.psa'))

    Extract the alignments of the first aligned sequence pair.

    >>> almA,almB,score = pairs.alignments[0]

    Calculate the PID score of the alignment.

    >>> pid(almA,almB)
    0.44444444444444442

    See also
    --------
    lingpy.compare.Multiple.get_pid

    .. todo:: change debug for ZeroDivisionError

    """

    zipped = zip(almA, almB)
    idn_pos = 0
    int_gps = 0
    aln_pos = 0

    for charA, charB in zipped:
        tmp = [charA, charB].count('-')
        if tmp == 1:
            int_gps += 1
        elif tmp == 0 and charA == charB:
            idn_pos += 1
            aln_pos += 1
        elif tmp == 0:
            aln_pos += 1

    if mode == 2:
        try:
            return idn_pos / (aln_pos + int_gps)
        except ZeroDivisionError:
            log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 1:
        try:
            return idn_pos / aln_pos
        except ZeroDivisionError:
            log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 3:
        srt_seq = min(
            len([i for i in almA if i != '-']), len([i for i in almB if i != '-']))
        try:
            return idn_pos / srt_seq
        except ZeroDivisionError:
            log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 4:
        srt_seq = min(
            len(''.join([i[0] for i in almA]).strip('-')),
            len(''.join([i[0] for i in almB]).strip('-')))
        try:
            return idn_pos / srt_seq
        except ZeroDivisionError:
            log.warning('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 5:
        return idn_pos / len(almA)
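
A worked example of the counting that feeds the formulas above, using a made-up alignment; the numbers illustrate the mode=2 branch exactly as implemented in this snippet.

# made-up alignment of two words, '-' marks an internal gap
almA = ['t', 'o', 'x', 't', 'ə', 'r']
almB = ['d', 'o', '-', 't', 'e', 'r']

idn_pos = sum(1 for a, b in zip(almA, almB) if '-' not in (a, b) and a == b)
aln_pos = sum(1 for a, b in zip(almA, almB) if '-' not in (a, b))
int_gps = sum(1 for a, b in zip(almA, almB) if (a, b).count('-') == 1)

print(idn_pos, aln_pos, int_gps)        # 3 5 1
print(idn_pos / (aln_pos + int_gps))    # 0.5, the mode=2 score in this snippet
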
Exemplo n.º 31
0
def context_profile(wordlist,
                    ref='ipa',
                    col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                    merge_vowels=False,
                    brackets=None,
                    splitters='/,;~',
                    merge_geminates=True,
                    clts=False,
                    bad_word="<???>",
                    bad_sound="<?>",
                    unknown_sound="!{0}",
                    examples=2,
                    max_entries=100,
                    normalization_form="NFC"):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    clts : dict (default=False)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If clts is passed, use this string to indicate that sounds are
        classified as "unknown sound" in the CLTS framework.
    examples : int(default=2)
        Indicate the number of examples that should be printed out.

    Returns
    -------
    profile : generator
        A generator of tuples with six items: the segment, its suggested
        conversion (sound class or CLTS sound), example words, languages,
        its frequency, and its unicode code points.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(wordlist.iter_rows(ref, col),
                                  desc='iter words',
                                  total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        word = unicodedata.normalize(normalization_form, word)
        if word.strip():
            try:
                cleaned_string = clean_string(
                    word,
                    semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels,
                    brackets=None,
                    ignore_brackets=False,
                    normalization_form=normalization_form,
                    split_entries=False,
                    preparse=None,
                    rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain whole word if there are splitters in the word
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(context_pre, context_post,
                                                   cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    for segment in [
                            x for x in word
                            if x not in ' '.join(cleaned_string)
                    ]:
                        if segment.strip():
                            profile[segment] += [(language, word)]
                            nulls.add(segment)
            except:
                errors.add(idx)
                log.warning('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    for idx, (s, entries) in pb(enumerate(
            sorted(profile.items(), key=lambda x: len(x[1]), reverse=True)),
                                desc='yielding entries',
                                total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words = [l[1] for l in entries][:max_entries]
        langs = [l[0] for l in entries][:max_entries]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x),
                   reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                ipa = '!' + s.strip('^$')
            else:
                ipa = str(sound)
        else:
            ipa = s.strip('^$')

        yield s, ipa, examples_, languages, frequency, codepoints
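
A usage sketch that writes the generated profile to a tab-separated file; the import path and the column header are assumptions, chosen to match the six items yielded per segment.

from lingpy import Wordlist
from lingpy.sequence.profile import context_profile  # assumed import path

wl = Wordlist('wordlist.tsv')                         # hypothetical input file
with open('orthography-profile.tsv', 'w', encoding='utf8') as f:
    f.write('Grapheme\tIPA\tExamples\tLanguages\tFrequency\tCodepoints\n')
    for row in context_profile(wl, ref='ipa', col='doculect'):
        f.write('\t'.join(row) + '\n')
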
Exemplo n.º 32
0
def plot_heatmap(
    wordlist,
    filename="heatmap",
    fileformat="pdf",
    ref='cogid',
    normalized=False,
    refB='',
    **keywords
):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to c{False}, don't normalize the data. Otherwise, select the
        normalization method, choosing between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=5)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=45)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,10))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.

    Notes
    -----
    This function plots shared cognate percentages.

    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False
    )
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes(
            [
                keywords['left'],
                keywords['bottom'],
                0.25 * keywords['width'],
                keywords['height']
            ]
        )
        # [0.01,0.1,0.2,0.7])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left',

        )
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        ax1.spines['bottom'].set_color('#ffffff')
        ax1.spines['top'].set_color('#ffffff')
        ax1.spines['left'].set_color('#ffffff')
        ax1.spines['right'].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']

    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if keywords['matrix']:
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=ref
                        )
                        cogsB = wordlist.get_list(
                            taxa=taxonB,
                            flat=True,
                            entry=ref
                        )

                        cogsA, cogsB = set(cogsA), set(cogsB)

                        shared = len(cogsA.intersection(cogsB))

                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(
                            taxa=taxonA,
                            entry=ref
                        )
                        cogsB = wordlist.get_dict(
                            taxa=taxonB,
                            entry=ref
                        )

                        shared = 0
                        slots = 0

                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:

                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warning(str(
                                [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
                            shared = 0.0

                    matrix[i][j] = shared

                    # if refB is also a possibility
                    if not refB:
                        matrix[j][i] = shared

                elif i > j and refB:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=refB
                        )
                        cogsB = wordlist.get_list(
                            taxa=taxonB,
                            flat=True,
                            entry=refB
                        )

                        cogsA, cogsB = set(cogsA), set(cogsB)

                        shared = len(cogsA.intersection(cogsB))

                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(
                            taxa=taxonA,
                            entry=refB
                        )
                        cogsB = wordlist.get_dict(
                            taxa=taxonB,
                            entry=refB
                        )

                        shared = 0
                        slots = 0

                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:

                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warning(str(
                                [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
                            shared = 0.0

                    matrix[i][j] = shared

                elif i == j:
                    cogs = wordlist.get_list(
                        taxa=taxonA,
                        flat=True,
                        entry=ref
                    )
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        matrix[i][j] = len(set(cogs))
    ax2 = fig.add_axes(
        [
            left,  # keywords['left']+0.25 * keywords['width']+0.05,
            keywords['bottom'],
            keywords['width'],
            keywords['height']
        ]
    )
    cmap = keywords['cmap'] 

    # [0.15,0.1,0.7,0.7])
    if 'distances' in keywords and keywords['distances']:
        for i, line in enumerate(matrix):
            for j, cell in enumerate(line):
                matrix[i][j] = 1 - matrix[i][j]
    nmatrix = [
            [keywords['vmax'], keywords['vmin']],
            [keywords['vmin'], keywords['vmax']]
            ]

    im = ax2.matshow(nmatrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin']
                     )

    # set the xticks
    steps = int(len(taxa) / keywords['steps'] + 0.5)
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])



    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default"
    )
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:

        plt.imshow(matrix, cmap=keywords['cmap'], visible=False, vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"], size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    plt.savefig(filename + '.' + fileformat)

    f = open(filename + '.matrix', 'w')
    for i, t in enumerate(taxa):
        f.write('{0:20}'.format(t))
        for j, c in enumerate(matrix[i]):
            if not normalized:
                f.write('\t{0:3}'.format(int(c)))
            else:
                f.write('\t{0:.2f}'.format(c))
        f.write('\n')
    f.close()
    log.file_written(filename + '.' + fileformat)