Example #1
    def test_prosodic_string(self):
        seq = 'tʰ ɔ x t ə r'.split(' ')
        assert prosodic_string(seq) == 'AXMBYN'

        seq = 'th o x ¹ t e'.split(' ')
        assert prosodic_string(seq) == 'AXLTBZ'

        seq = 'th o x _ th o x'.split(' ')
        assert prosodic_string(seq) == 'AXN_AXN'
        assert not prosodic_string('')
Example #2
    def test_prosodic_string(self):
        seq = 'tʰ ɔ x t ə r'.split(' ')
        assert prosodic_string(seq) == 'AXMBYN'

        seq = 'th o x ¹ t e'.split(' ')
        assert prosodic_string(seq) == 'AXLTBZ'

        seq = 'th o x _ th o x'.split(' ')
        assert prosodic_string(seq) == 'AXN_AXN'
        assert not prosodic_string('')
Example #3
def edit_dist(seqA, seqB, normalized=False, restriction=''):
    """
    Return the edit distance between two strings.

    Parameters
    ----------
    seqA, seqB : str
        The strings to be compared.
    normalized : bool (default=False)
        Specify whether the normalized edit distance should be returned. If no
        restriction is chosen, the edit distance is normalized by dividing it by
        the length of the longer string. If *restriction* is set to *cv*
        (consonant-vowel), the edit distance is normalized by the length of the
        alignment.
    restriction : {"cv"} (default="")
        Specify the restrictions to be used. Currently, only ``cv`` is
        supported. This prohibits matches of vowels with consonants.

    Notes
    -----
    The edit distance was first formally defined by V. I. Levenshtein
    (:evobib:`Levenshtein1965`). The first algorithm to compute the edit
    distance was proposed by Wagner and Fischer (:evobib:`Wagner1974`).

    Returns
    -------
    dist : {int, float}
        The edit distance, which is a float if *normalized* is set to ``True``,
        and an integer otherwise.

    Examples
    --------
    >>> seqA = 'fat cat'
    >>> seqB = 'catfat'
    >>> edit_dist(seqA, seqB)
    3

    """
    # convert strings and tuples to lists; anything else must already be a list
    if isinstance(seqA, (text_type, tuple)):
        seqA = list(seqA)
        seqB = list(seqB)
    elif not isinstance(seqA, list):
        raise ValueError("Input should be tuple, list, or string.")

    if restriction in ['cv', 'consonant-vowel']:
        resA = prosodic_string(seqA, 'cv')
        resB = prosodic_string(seqB, 'cv')
        return malign.restricted_edit_dist(seqA, seqB, resA, resB, normalized)

    return malign.edit_dist(seqA, seqB, normalized)
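
A minimal usage sketch for the function above (the token lists are invented for illustration, and ``from lingpy import edit_dist`` is the assumed import path): without a restriction the distance is plain Levenshtein over the token lists, and ``restriction='cv'`` additionally prohibits consonant-vowel matches.

from lingpy import edit_dist  # assumed import path

seqA = 't o x t e r'.split(' ')
seqB = 't o x t a'.split(' ')

# plain Levenshtein distance over the token lists: substitute e/a, delete r
assert edit_dist(seqA, seqB) == 2

# normalized by the length of the longer sequence: 2 / 6
assert abs(edit_dist(seqA, seqB, normalized=True) - 2 / 6) < 1e-9

# the consonant-vowel restriction forbids matching vowels with consonants and
# normalizes by the alignment length instead
d = edit_dist(seqA, seqB, normalized=True, restriction='cv')
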
Example #4
    def evaluate_string(self, string, tokens=False, **keywords):
        setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
        if not tokens:
            tokens = ipa2tokens(string)
        score = 1
        dist = self.dist['#']

        prostring = prosodic_string(tokens, rcParams['art'], cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
        if self.classes:
            c = tokens2class(tokens, self.model, cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
            teststring = list(zip(prostring, c))
        else:
            teststring = list(zip(prostring, tokens))

        scores = []

        while len(teststring) > 0:
            segment = teststring.pop(0)
            freq = dist.count(segment)
            allf = len(dist)
            s = freq / allf
            score = score * s
            scores += [s]
            dist = self.dist[segment]
        score = score * s
        scores += [s]
        lscore = np.log10(score)
        lscore = lscore / len(tokens)
        return score, lscore  # np.log10(score)
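
The scoring loop above multiplies, for each (prosodic class, segment) state, the relative frequency of that state among the successors of the previous state, and returns the raw product together with its length-normalized log10 score. Below is a self-contained sketch of that core computation on an invented successor table (the toy ``dist`` dictionary is only for illustration and is not lingpy's internal data structure).

import math

# hypothetical successor lists: state -> observed following states
dist = {
    '#':        [('A', 'h'), ('A', 'h'), ('A', 'f')],
    ('A', 'h'): [('X', 'a'), ('X', 'a'), ('X', 'u')],
    ('X', 'a'): [('N', 'n'), ('N', 's')],
}

word = [('A', 'h'), ('X', 'a'), ('N', 'n')]  # toy bigram string

score, prev = 1.0, '#'
for state in word:
    s = dist[prev].count(state) / len(dist[prev])  # relative frequency of the transition
    score *= s
    prev = state

lscore = math.log10(score) / len(word)  # length-normalized log score
print(score, lscore)  # 0.2222..., roughly -0.218
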
Example #5
    def evaluate_string(self, string, tokens=False, **keywords):
        setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
        if not tokens:
            tokens = ipa2tokens(string)
        score = 1
        dist = self.dist['#']

        prostring = prosodic_string(tokens, rcParams['art'], cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
        if self.classes:
            c = tokens2class(tokens, self.model, cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
            teststring = list(zip(prostring, c))
        else:
            teststring = list(zip(prostring, tokens))

        scores = []

        while len(teststring) > 0:
            segment = teststring.pop(0)
            freq = dist.count(segment)
            allf = len(dist)
            s = freq / allf
            score = score * s
            scores += [s]
            dist = self.dist[segment]
        score = score * s
        scores += [s]
        lscore = np.log10(score)
        lscore = lscore / len(tokens)
        return score, lscore  # np.log10(score)
Example #6
def edit_dist(seqA, seqB, normalized=False, restriction=''):
    """
    Return the edit distance between two strings.

    Parameters
    ----------
    seqA, seqB : str
        The strings to be compared.
    normalized : bool (default=False)
        Specify whether the normalized edit distance should be returned. If no
        restriction is chosen, the edit distance is normalized by dividing it by
        the length of the longer string. If *restriction* is set to *cv*
        (consonant-vowel), the edit distance is normalized by the length of the
        alignment.
    restriction : {"cv"} (default="")
        Specify the restrictions to be used. Currently, only ``cv`` is
        supported. This prohibits matches of vowels with consonants.

    Notes
    -----
    The edit distance was first formally defined by V. I. Levenshtein
    (:evobib:`Levenshtein1965`). The first algorithm to compute the edit
    distance was proposed by Wagner and Fischer (:evobib:`Wagner1974`).

    Returns
    -------
    dist : {int, float}
        The edit distance, which is a float if *normalized* is set to ``True``,
        and an integer otherwise.

    Examples
    --------
    Align two sequences::

        >>> seqA = 'fat cat'
        >>> seqB = 'catfat'
        >>> edit_dist(seqA, seqB)
        3

    """
    seqA, seqB = _as_lists(seqA, seqB)
    if restriction in ['cv', 'consonant-vowel']:
        resA = prosodic_string(seqA, 'cv')
        resB = prosodic_string(seqB, 'cv')
        return malign.restricted_edit_dist(seqA, seqB, resA, resB, normalized)

    return malign.edit_dist(seqA, seqB, normalized)
Example #7
    def __init__(self,
                 words,
                 tokens=False,
                 prostrings=[],
                 classes=False,
                 class_model=rcParams['model'],
                 **keywords):
        setdefaults(keywords,
                    stress=rcParams['stress'],
                    diacritics=rcParams['diacritics'],
                    cldf=False)
        self.model = class_model
        self.words = words
        self.tokens = []
        self.bigrams = []
        self.classes = []

        # start filling the dictionary
        for i, w in enumerate(words):

            # check for tokenized string
            if not tokens:
                tk = ipa2tokens(w, **keywords)
            else:
                tk = w[:]
            self.tokens += [tk]

            # create prosodic string
            if prostrings:
                p = prostrings[i]
            else:
                tt = tokens2class(tk, rcParams['art'])
                p = prosodic_string(tk,
                                    rcParams['art'],
                                    cldf=keywords['cldf'],
                                    diacritics=keywords['diacritics'],
                                    stress=keywords['stress'])
            # create classes
            if classes:
                c = tokens2class(tk,
                                 class_model,
                                 cldf=keywords['cldf'],
                                 diacritics=keywords['diacritics'],
                                 stress=keywords['stress'])
                bigrams = list(zip(p, c))
                self.classes += [c]
            else:
                # zip the stuff
                bigrams = list(zip(p, tk))

            # start appending the stuff
            self.bigrams += [bigrams]

            # init the mother object
            MCBasic.__init__(self, self.bigrams)
Example #8
    def __init__(
        self,
        words,
        tokens=False,
        prostrings=[],
        classes=False,
        class_model=rcParams['model'],
        **keywords
    ):
        setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
        self.model = class_model
        self.words = words
        self.tokens = []
        self.bigrams = []
        self.classes = []

        # start filling the dictionary
        for i, w in enumerate(words):

            # check for tokenized string
            if not tokens:
                tk = ipa2tokens(w, **keywords)
            else:
                tk = w[:]
            self.tokens += [tk]

            # create prosodic string
            if prostrings:
                p = prostrings[i]
            else:
                tt = tokens2class(tk, rcParams['art'])
                p = prosodic_string(
                        tk, 
                        rcParams['art'],
                        cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
            # create classes
            if classes:
                c = tokens2class(tk, class_model, cldf=keywords['cldf'],
                        diacritics=keywords['diacritics'],
                        stress=keywords['stress'])
                bigrams = list(zip(p, c))
                self.classes += [c]
            else:
                # zip the stuff
                bigrams = list(zip(p, tk))

            # start appending the stuff
            self.bigrams += [bigrams]

            # init the mother object
            MCBasic.__init__(self, self.bigrams)
Example #9
    def __init__(self, seqs, seqB=False, **keywords):
        # check whether only two sequences or multiple sequence pairs were
        # passed as input
        if seqB:
            self.seqs = [(seqs, seqB)]
        else:
            self.seqs = seqs

        # add the basic representation of sequences
        self.tokens = []
        self.prostrings = []

        # define a tokenizer function for convenience
        defaults = {
            "diacritics": rcParams['diacritics'],
            "vowels": rcParams['vowels'],
            "tones": rcParams['tones'],
            "combiners": rcParams['combiners'],
            "breaks": rcParams['breaks'],
            "stress": rcParams['stress'],
            "merge_vowels": rcParams['merge_vowels']
        }
        for k in defaults:
            if k not in keywords:
                keywords[k] = defaults[k]

        tokenize = lambda x: ipa2tokens(x, **keywords)

        # start to loop over data and create the stuff
        for k, (seqA, seqB) in enumerate(self.seqs):
            # get the tokens
            tokA, tokB = tokenize(seqA), tokenize(seqB)

            # get the prostrings
            proA, proB = \
                prosodic_string(tokA, **keywords), prosodic_string(tokB, **keywords)

            # append the stuff
            self.tokens += [[tokA, tokB]]
            self.prostrings += [[proA, proB]]
Example #10
    def __init__(self, seqs, seqB=False, **keywords):
        # check whether only two sequences or multiple sequence pairs were
        # passed as input
        if seqB:
            self.seqs = [(seqs, seqB)]
        else:
            self.seqs = seqs

        # add the basic representation of sequences
        self.tokens = []
        self.prostrings = []

        # define a tokenizer function for convenience
        defaults = {
            "diacritics": rcParams['diacritics'],
            "vowels": rcParams['vowels'],
            "tones": rcParams['tones'],
            "combiners": rcParams['combiners'],
            "breaks": rcParams['breaks'],
            "stress": rcParams['stress'],
            "merge_vowels": rcParams['merge_vowels']
        }
        for k in defaults:
            if k not in keywords:
                keywords[k] = defaults[k]

        tokenize = lambda x: ipa2tokens(x, **keywords)

        # start to loop over data and create the stuff
        for k, (seqA, seqB) in enumerate(self.seqs):
            # get the tokens
            tokA, tokB = tokenize(seqA), tokenize(seqB)

            # get the prostrings
            proA, proB = \
                prosodic_string(tokA, **keywords), prosodic_string(tokB, **keywords)

            # append the stuff
            self.tokens += [[tokA, tokB]]
            self.prostrings += [[proA, proB]]
Example #11
    def test_prosodic_weights(self):
        seq = 'tʰ ɔ x t ə r'.split(' ')

        assert prosodic_weights(prosodic_string(seq))[0] == 2
        assert prosodic_weights(prosodic_string(seq))[-1] == 0.8
Example #12
def test_prosodic_weights():

    seq = 'tʰ ɔ x t ə r'.split(' ')

    assert prosodic_weights(prosodic_string(seq))[0] == 2
    assert prosodic_weights(prosodic_string(seq))[-1] == 0.8
Example #13
def cv_templates(wordlist,
                 language,
                 segments='tokens',
                 converter=None,
                 cutoff=0.1,
                 output='markdown',
                 examples=3,
                 scoredict=None,
                 splitter=False):
    """Create CV templates from wordlist data."""
    templates = defaultdict(list)
    idxs = wordlist.get_list(col=language, flat=True)
    sounds = defaultdict(list)

    def str_(list_):
        return ', '.join([' '.join(l) for l in list_[:examples]])

    if not converter:
        converter = lambda x: prosodic_string(x, _output='CcV')
    scoredict = scoredict or _scorer()
    if not splitter:
        splitter = lambda x: filter(None, tokens2morphemes(x))

    for idx in idxs:
        segs = wordlist[idx, segments]
        for word in splitter(segs):
            cv = converter(word)
            templates[cv] += [word]
            for sound, symbol in zip(word, cv):
                sounds[sound, symbol] += [word]

    # compute the coverage threshold implied by the cutoff
    lengths = sum([len(v) for v in templates.values()])
    perc = lengths - (cutoff * lengths)

    patterns, ignored = [], []
    score = 0
    for k, v in sorted(templates.items(),
                       key=lambda x: len(x[1]),
                       reverse=True):
        l = len(v)
        if score + l > perc:
            ignored += [[k, l, v]]
        else:
            patterns += [[k, l, v]]
        score += l

    # compute pattern consensus
    consensus = pattern_consensus([list(p[0]) for p in patterns], scoredict)

    # collect each sound with its CV context, sorted by context and frequency
    sound_table = []
    for k, v in sorted(sounds.items(), key=lambda x: (x[0][1], len(x[1]))):
        sound_table += [(k[0], k[1], len(v), v)]

    if output == 'markdown':
        out = 'Pattern | Frequency | Examples\n --- | --- | --- \n'
        score = 0
        for i, (p, l, v) in enumerate(patterns):
            out += '{0:15} | {1:5} | {2}\n'.format(p, l, str_(v))
            score += l
        count = 1
        out += '\nSound | Context | Frequency | Examples\n --- | --- | --- | --- \n'
        for sound, context, l, vals in sound_table:
            out += '{0} | {1} | {2} | {3} \n'.format(sound, context, l,
                                                     str_(vals))

        out += '\n* **coverage:** {0} out of {1} patterns in the data\n'.format(
            score, lengths)
        out += '* **pattern consensus:** {0}\n'.format(' '.join(consensus))
        return out

    return patterns, ignored, sound_table
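
A hedged usage sketch for ``cv_templates`` (the file name, language name, and column contents are assumptions for illustration; ``Wordlist`` is lingpy's wordlist class, and the function defined above is assumed to be in scope):

from lingpy import Wordlist

wl = Wordlist('wordlist.tsv')  # hypothetical input with a 'tokens' column

# markdown report: CV patterns covering roughly (1 - cutoff) of the word forms,
# plus a table of sounds with their CV contexts
print(cv_templates(wl, 'German', cutoff=0.1, output='markdown'))

# any other value for `output` returns the raw data structures instead
patterns, ignored, sound_table = cv_templates(wl, 'German', output=None)
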