Example No. 1
    def evaluate_string(self, string, tokens=False, **keywords):
        setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
        if not tokens:
            tokens = ipa2tokens(string)
        score = 1
        # start from the distribution of segments following the word boundary
        dist = self.dist['#']

        prostring = prosodic_string(tokens, rcParams['art'], cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
        if self.classes:
            c = tokens2class(tokens, self.model, cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
            teststring = list(zip(prostring, c))
        else:
            teststring = list(zip(prostring, tokens))

        scores = []

        # walk the bigram chain: score each segment by its relative frequency
        # in the distribution following the previous segment
        while len(teststring) > 0:
            segment = teststring.pop(0)
            freq = dist.count(segment)
            allf = len(dist)
            s = freq / allf
            score = score * s
            scores += [s]
            dist = self.dist[segment]
        score = score * s
        scores += [s]

        # length-normalize the log score
        lscore = np.log10(score)
        lscore = lscore / len(tokens)
        return score, lscore  # np.log10(score)
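
At its core this scorer walks a first-order Markov chain over bigrams, multiplying the relative frequency of each transition and length-normalizing the log score. A minimal self-contained sketch of the same idea, with toy transition distributions standing in for the trained MCBasic state (all names below are invented for illustration):

import numpy as np

# toy transition distributions: '#' marks the word start, '$' the word end
dists = {'#': ['a', 'a', 'b'], 'a': ['b', '$'], 'b': ['$']}

def toy_score(segments):
    dist, score = dists['#'], 1.0
    for seg in segments:
        score *= dist.count(seg) / len(dist)  # relative transition frequency
        dist = dists[seg]
    return score, np.log10(score) / len(segments)

print(toy_score(['a', 'b']))  # (2/3 * 1/2 = 0.333..., log10(1/3) / 2)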
Example No. 2
    def __init__(self,
                 words,
                 tokens=False,
                 prostrings=[],
                 classes=False,
                 class_model=rcParams['model'],
                 **keywords):
        setdefaults(keywords,
                    stress=rcParams['stress'],
                    diacritics=rcParams['diacritics'],
                    cldf=False)
        self.model = class_model
        self.words = words
        self.tokens = []
        self.bigrams = []
        self.classes = []

        # start filling the dictionary
        for i, w in enumerate(words):

            # check for tokenized string
            if not tokens:
                tk = ipa2tokens(w, **keywords)
            else:
                tk = w[:]
            self.tokens += [tk]

            # create prosodic string
            if prostrings:
                p = prostrings[i]
            else:
                tt = tokens2class(tk, rcParams['art'])
                p = prosodic_string(tk,
                                    rcParams['art'],
                                    cldf=keywords['cldf'],
                                    diacritics=keywords['diacritics'],
                                    stress=keywords['stress'])
            # create classes
            if classes:
                c = tokens2class(tk,
                                 class_model,
                                 cldf=keywords['cldf'],
                                 diacritics=keywords['diacritics'],
                                 stress=keywords['stress'])
                bigrams = list(zip(p, c))
                self.classes += [c]
            else:
                # zip the stuff
                bigrams = list(zip(p, tk))

            # start appending the stuff
            self.bigrams += [bigrams]

        # init the mother object
        MCBasic.__init__(self, self.bigrams)
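
The bigrams this class trains on are simply pairs of a prosodic-string symbol with either the raw token or its sound class. A toy illustration (the prosodic symbols below are invented, not real LingPy output):

prostring = 'AXBZ'               # hypothetical prosodic symbols
tokens = ['t', 'o', 'x', 'a']
bigrams = list(zip(prostring, tokens))
print(bigrams)  # [('A', 't'), ('X', 'o'), ('B', 'x'), ('Z', 'a')]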
Example No. 3
def turchin(seqA, seqB, model='dolgo', **keywords):
    """
    Return cognate judgment based on the method by :evobib:`Turchin2010`.

    Parameters
    ----------
    seqA, seqB : {str, list, tuple}
        The input strings. These should be iterables, so you can use tuples,
        lists, or strings.
    model : {"asjp", "sca", "dolgo"} (default="dolgo")
        A sound-class model instance or a string that denotes one of the
        standard sound class models used in LingPy.

    Returns
    -------
    cognacy : {0, 1}
        The cognacy assertion which is either 0 (words are probably cognate) or
        1 (words are not likely to be cognate).

    """
    if text_type(model) == model:
        model = rcParams[model]
    elif not hasattr(model, 'info'):
        raise ValueError("[!] No valid model instance selected.")

    if isinstance(seqA, (text_type, str)):
        seqA = ipa2tokens(seqA)
        seqB = ipa2tokens(seqB)

    classA = tokens2class(seqA, model)
    classB = tokens2class(seqB, model)

    if classA[0] in model.vowels:
        classA[0] = 'H'
    if classB[0] in model.vowels:
        classB[0] = 'H'

    if ''.join([k for k in classA if k not in model.vowels])[:2] == \
            ''.join([k for k in classB if k not in model.vowels])[:2]:
        return 0
    else:
        return 1
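
The decision rule boils down to comparing the first two consonant classes of each word. A self-contained sketch of that comparison, with a toy sound-class mapping standing in for LingPy's dolgo model and the initial-vowel-to-'H' step omitted:

TOY_CLASSES = {'t': 'T', 'd': 'T', 'x': 'K', 'k': 'K', 'r': 'R',
               'a': 'V', 'o': 'V', 'e': 'V'}
VOWELS = {'V'}

def toy_turchin(a, b):
    ca = [TOY_CLASSES.get(ch, '?') for ch in a]
    cb = [TOY_CLASSES.get(ch, '?') for ch in b]
    consA = ''.join(k for k in ca if k not in VOWELS)[:2]
    consB = ''.join(k for k in cb if k not in VOWELS)[:2]
    return int(consA != consB)

print(toy_turchin('taxta', 'daxta'))  # 0: both start with classes T, K
print(toy_turchin('taxta', 'rata'))   # 1: T, K vs. R, T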
Example No. 4
def ipa_to_asjp(w):
    """
    LingPy IPA-to-ASJP converter plus some cleanup.
    This function is called on IPA datasets.
    """
    w = w.replace('\"', '').replace('-', '').replace(' ', '')
    wA = ''.join(tokens2class(ipa2tokens(w, merge_vowels=False), 'asjp'))
    wAA = clean_asjp(wA.replace('0', '').replace('I', '3').replace('H', 'N'))
    asjp = ''.join([x for x in wAA if x in sounds])
    return asjp
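
The pipeline is: tokenize the IPA string, map the tokens to ASJP classes, normalize a few class symbols, and keep only the permitted ASJP sounds. `clean_asjp` and `sounds` are module-level helpers not shown here; the sketch below imitates the final normalization and filtering steps with a toy symbol inventory (not the official ASJP set):

ASJP_SOUNDS = set('pbmfvtdszcnrlSkgxhNqGXy7Lw!ieEau3o')  # toy subset

def toy_filter(word):
    word = word.replace('0', '').replace('I', '3').replace('H', 'N')
    return ''.join(x for x in word if x in ASJP_SOUNDS)

print(toy_filter('t0oIx'))  # 'to3x'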
Example No. 5
def i2t(s, merge_vowels: bool = True):
    tokens = ipa2tokens(s, merge_vowels=merge_vowels, merge_geminates=True)
    ret = list()
    for token in tokens:
        l = len(token)
        # NOTE(j_luo) Merge geminates into one segment.
        if l % 2 == 0 and token[:l // 2] == token[l // 2:]:
            ret.append(token[:l // 2] + 'ː')
        else:
            ret.append(token)
    return ret
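
The geminate check stands on its own: an even-length token whose two halves are identical collapses into one half plus the IPA length mark. The same logic without the LingPy dependency:

def merge_geminate(token):
    half = len(token) // 2
    if len(token) % 2 == 0 and token[:half] == token[half:]:
        return token[:half] + 'ː'
    return token

assert merge_geminate('tt') == 'tː'
assert merge_geminate('ta') == 'ta'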
Example No. 6
    def __init__(
        self,
        words,
        tokens=False,
        prostrings=[],
        classes=False,
        class_model=rcParams['model'],
        **keywords
    ):
        setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
        self.model = class_model
        self.words = words
        self.tokens = []
        self.bigrams = []
        self.classes = []

        # start filling the dictionary
        for i, w in enumerate(words):

            # check for tokenized string
            if not tokens:
                tk = ipa2tokens(w, **keywords)
            else:
                tk = w[:]
            self.tokens += [tk]

            # create prosodic string
            if prostrings:
                p = prostrings[i]
            else:
                tt = tokens2class(tk, rcParams['art'])
                p = prosodic_string(
                        tk, 
                        rcParams['art'],
                        cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
            # create classes
            if classes:
                c = tokens2class(tk, class_model, cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
                bigrams = list(zip(p, c))
                self.classes += [c]
            else:
                # zip the stuff
                bigrams = list(zip(p, tk))

            # start appending the stuff
            self.bigrams += [bigrams]

        # init the mother object
        MCBasic.__init__(self, self.bigrams)
Example No. 7
def i2t(ipa):
    ipa = unicodedata.normalize('NFD', ipa)
    ipa = re.sub(r'^\*', '', ipa)
    tokens = ipa2tokens(ipa, merge_vowels=False, merge_geminates=False)
    ret = list()
    for t in tokens:
        # NOTE(j_luo) Stress symbol is not handled by `ipapy`'s canonicalization process.
        t = t.replace("'", 'ˈ')
        # NOTE(j_luo) Not sure what these symbols mean.
        t = t.replace('̣', '').replace('̧', '').replace('̦', '')
        ret.append(str(IPAString(unicode_string=t)))
    return ret
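
The NFD normalization step decomposes precomposed characters into a base character plus combining marks before tokenization, which is what makes the later character-level replacements reliable:

import unicodedata

s = '\u00e9'                       # 'é' as one precomposed code point
nfd = unicodedata.normalize('NFD', s)
print([hex(ord(c)) for c in nfd])  # ['0x65', '0x301']: 'e' + combining acute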
Example No. 8
def test_ipa2tokens():

    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'

    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'

    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'

    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'

    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far
    tokens = csv2list(test_data('test_tokenization.csv'))

    for a, b in tokens:

        tks1 = ' '.join(ipa2tokens(a))
        tks2 = ' '.join(ipa2tokens(a, merge_vowels=False))

        # we check for two variants, since we don't know whether vowels are
        # merged or not in the test data
        assert tks1 == b or tks2 == b
Example No. 9
def turchin(seqA, seqB, model='dolgo', **keywords):
    """
    Return cognate judgment based on the method by :evobib:`Turchin2010`.

    Parameters
    ----------
    seqA, seqB : {str, list, tuple}
        The input strings. These should be iterables, so you can use tuples,
        lists, or strings.
    model : {"asjp", "sca", "dolgo"} (default="dolgo")
        A sound-class model instance or a string that denotes one of the
        standard sound class models used in LingPy.

    Returns
    -------
    cognacy : {0, 1}
        The cognacy assertion which is either 0 (words are probably cognate) or
        1 (words are not likely to be cognate).

    """
    if text_type(model) == model:
        model = rcParams[model]
    elif not hasattr(model, 'info'):
        raise ValueError("[!] No valid model instance selected.")

    if isinstance(seqA, string_types):
        seqA = ipa2tokens(seqA)
        seqB = ipa2tokens(seqB)

    classA = tokens2class(seqA, model)
    classB = tokens2class(seqB, model)

    if classA[0] in model.vowels:
        classA[0] = 'H'
    if classB[0] in model.vowels:
        classB[0] = 'H'

    return int(''.join([k for k in classA if k not in model.vowels])[:2] !=
               ''.join([k for k in classB if k not in model.vowels])[:2])
Example No. 10
def ipa_to_asjp(w, params):
    """
	Lingpy IPA-to-ASJP converter plus some cleanup.
	Expects the params {} to contain the key: sounds.
	
	This function is called on IPA datasets.
	"""
    w = w.replace('\"', '').replace('-', '').replace(' ', '')
    wA = ''.join(tokens2class(ipa2tokens(w, merge_vowels=False), 'asjp'))
    wAA = clean_asjp(wA.replace('0', '').replace('I', '3').replace('H', 'N'))
    asjp = ''.join([x for x in wAA if x in params['sounds']])
    assert len(asjp) > 0
    return asjp
Example No. 11
def test_ipa2tokens():

    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'

    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'

    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'

    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'

    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far, but only on cases where
    # tokenization doesn't require the merge_vowels = False flag
    tokens = csv2list(test_data('test_tokenization.tsv'))

    for a, b in tokens:

        tks = ' '.join(ipa2tokens(a))

        # the default (merged-vowel) tokenization should match the gold
        # standard
        assert tks == b

    # now test on smaller set with unmerged vowels
    tokens = csv2list(test_data('test_tokenization_mv.tsv'))

    for a, b in tokens:

        tks = ' '.join(ipa2tokens(a, merge_vowels=False,
                                  merge_geminates=False))

        # the unmerged-vowel tokenization should match the gold standard
        assert tks == b

    tokens = csv2list(test_data('test_tokenization_nasals.tsv'))
    for a, b in tokens:
        tks = ' '.join(
            ipa2tokens(a,
                       merge_vowels=True,
                       merge_geminates=True,
                       expand_nasals=True,
                       semi_diacritics='h'))
        assert tks == b
Example No. 12
def test_ipa2tokens():

    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'

    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'
    
    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'
    
    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'
    
    assert len(ipa2tokens(seq)) == len(seq.split(' '))-2

    # now check with all possible data we have so far, but only on cases where
    # tokenization doesn't require the merge_vowels = False flag
    tokens = csv2list(test_data('test_tokenization.tsv'))
    
    for a, b in tokens:
        
        tks = ' '.join(ipa2tokens(a))

        # the default (merged-vowel) tokenization should match the gold
        # standard
        assert tks == b

    # now test on smaller set with unmerged vowels 
    tokens = csv2list(test_data('test_tokenization_mv.tsv'))
    
    for a, b in tokens:
        
        tks = ' '.join(ipa2tokens(a, merge_vowels=False, merge_geminates=False))

        # the unmerged-vowel tokenization should match the gold standard
        assert tks == b

    tokens = csv2list(test_data('test_tokenization_nasals.tsv'))
    for a, b in tokens:
        tks = ' '.join(ipa2tokens(a, merge_vowels=True, merge_geminates=True,
                                  expand_nasals=True, semi_diacritics='h'))
        assert tks == b
Example No. 13
    def __init__(self, seqs, seqB=False, **keywords):
        # check, whether there are only two sequences or multiple sequence
        # pairs as input
        if seqB:
            self.seqs = [(seqs, seqB)]
        else:
            self.seqs = seqs

        # add the basic representation of sequences
        self.tokens = []
        self.prostrings = []

        # define a tokenizer function for convenience
        defaults = {
            "diacritics": rcParams['diacritics'],
            "vowels": rcParams['vowels'],
            "tones": rcParams['tones'],
            "combiners": rcParams['combiners'],
            "breaks": rcParams['breaks'],
            "stress": rcParams['stress'],
            "merge_vowels": rcParams['merge_vowels']
        }
        for k in defaults:
            if k not in keywords:
                keywords[k] = defaults[k]

        tokenize = lambda x: ipa2tokens(x, **keywords)

        # start to loop over data and create the stuff
        for k, (seqA, seqB) in enumerate(self.seqs):
            # get the tokens
            tokA, tokB = tokenize(seqA), tokenize(seqB)

            # get the prostrings
            proA, proB = \
                prosodic_string(tokA, **keywords), prosodic_string(tokB, **keywords)

            # append the stuff
            self.tokens += [[tokA, tokB]]
            self.prostrings += [[proA, proB]]
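
The manual defaults-filling loop above is equivalent to a dict merge in which caller-supplied keywords win; a small sketch of that pattern (toy values):

defaults = {'merge_vowels': True, 'stress': 'ˈˌ'}
keywords = {'stress': "'"}
keywords = {**defaults, **keywords}  # caller-supplied values override defaults
print(keywords)                      # {'merge_vowels': True, 'stress': "'"}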
Example No. 14
def i2t(ipa: str) -> List[str]:
    """ipa2token call. Raises error if return is empty."""
    ret = ipa2tokens(ipa, merge_vowels=True, merge_geminates=False)
    if not ret:
        raise I2tException
    return ret
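
A hypothetical caller, assuming `I2tException` is defined alongside this function; raising instead of returning an empty list forces callers to handle failed tokenizations explicitly:

try:
    tokens = i2t('kat')
except I2tException:
    tokens = []  # e.g. fall back to skipping the form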
Example No. 15
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *
    
    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')
    
    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.
    

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first; since output of data is not a daily
    # task, the loss in speed won't matter much

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for local-alignment information; columns not listed in
    # msa['local'] are rendered as unaligned
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:

        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        seqs = dict([
            (a[1], b) for a, b in zip(
                sorted(
                    zip(classes, msa['seqs']),
                    key=lambda x: x[0]  # list(zip(x[0],x[1]))
                ),
                range(1,
                      len(msa['seqs']) + 1))
        ])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
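
The pid-score block above is a rounded mean of pairwise PID scores over all unordered pairs of alignment rows. The same computation in isolation (`pid` here is any pairwise identity scorer passed in, such as LingPy's):

from itertools import combinations

def mean_pid(alignment_rows, pid):
    scores = [pid(a, b) for a, b in combinations(alignment_rows, 2)]
    return int(100 * sum(scores) / len(scores) + 0.5)

# mean_pid(msa['alignment'], pid) reproduces the default branch above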
Example No. 16
def load_dataset(input_path, source, input_type, output_path):
    print(
        " - Loading dataset and performing necessary conversion/tokenization.")
    if os.path.exists(output_path):
        print("Using existing wordlist file, nothing is generated.")
        return
    # No NA filter: the word form 'nan' should not be interpreted as NaN :p
    df = pd.read_csv(input_path, sep="\t", na_filter=False)

    # Depending on file format, remove and/or rename columns
    if source == "ielex" or source == "ielex-corr":
        # Rename columns
        df.rename(columns={
            "Language": "DOCULECT",
            "Meaning": "CONCEPT",
            "Phonological Form": "IPA",
            "Cognate Class": "COGNATES_IELEX",
            "cc": "CONCEPT_COGNATES_IELEX"
        },
                  inplace=True)
        # Drop column with unused numbers
        df.drop(df.columns[[0]], axis=1, inplace=True)
    elif source == "northeuralex":
        df.rename(columns={
            "Language_ID": "DOCULECT",
            "Concept_ID": "CONCEPT"
        },
                  inplace=True)

    tokens = []
    if source == "ielex":
        # Perform IPA->ASJP conversion if source is ielex
        forms = []
        for form_ipa in df["IPA"]:
            # ipa_to_asjp method accepts both space-separated (NELex) and
            # non-separated (IELex)
            if input_type == "asjp":
                form_asjp = utility.ipa_to_asjp(form_ipa)
                forms.append(form_asjp)
                tokens_form = list(form_asjp)
            elif input_type == "ipa":
                tokens_form = ipa2tokens(form_ipa)
            tokens_string = " ".join(tokens_form)
            tokens.append(tokens_string)
        if input_type == "asjp":
            df["ASJP"] = forms
        df["TOKENS"] = tokens
    elif source == "northeuralex":
        if input_type == "asjp":
            for form_asjp in df["ASJP"]:
                tokens_form = list(form_asjp)
                tokens_string = " ".join(tokens_form)
                tokens.append(tokens_string)
            df["TOKENS"] = tokens
        elif input_type == "ipa":
            df["TOKENS"] = df[input_type]
    # Filter out rows with XXX phonology field.
    df = df[df["IPA"] != "XXX"]
    # Filter out rows with empty phonology field
    df = df[df["IPA"] != ""]

    # Apply IELex cognate judgments to NElex
    # TODO: We can only do this if there is a publicly available intersection file
    #
    # if source == "northeuralex":
    #     # Load intersection file
    #     df_intersection = pd.read_csv(intersection_path, sep="\t")
    #     # Per row, retrieve matching IELex judgment from intersection
    #     cognates_intersection = []
    #     for _, row in df.iterrows():
    #         cog = df_intersection[((df_intersection["iso_code"] == row["DOCULECT"]) & (df_intersection["gloss_northeuralex"] == row["CONCEPT"]) & (df_intersection["ortho_northeuralex"] == row["COUNTERPART"]))]["cog_class_ielex"]
    #         if cog.empty:
    #             cog = None
    #         else:
    #             cog = cog.iloc[0]
    #         cognates_intersection.append(cog)
    #     df["COGNATES_IELEX"] = cognates_intersection
    #     # Create CONCEPT_COGNATES_IELEX column with unique cognate classes across concepts
    #     df["CONCEPT_COGNATES_IELEX"] = df["CONCEPT"] + "-" + df["COGNATES_IELEX"]

    print(f" - Writing corpus (with conversions) to {output_path}")
    df.to_csv(output_path, index_label="ID", sep="\t")
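
A hypothetical invocation (the paths are invented; `source` and `input_type` take the values the branches above accept):

load_dataset('data/ielex.tsv', source='ielex', input_type='ipa',
             output_path='output/ielex-wordlist.tsv')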