def test_prosodic_string(self):
    seq = 'tʰ ɔ x t ə r'.split(' ')
    assert prosodic_string(seq) == 'AXMBYN'

    seq = 'th o x ¹ t e'.split(' ')
    assert prosodic_string(seq) == 'AXLTBZ'

    seq = 'th o x _ th o x'.split(' ')
    assert prosodic_string(seq) == 'AXN_AXN'

    assert not prosodic_string('')
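
# For reference, a minimal sketch of what the assertions above exercise
# (assuming `prosodic_string` is importable from `lingpy`, as in the released
# package). Each output symbol encodes one segment's prosodic context, e.g.
# 'A' = sequence-initial consonant, 'X' = first-syllable vowel, 'N' =
# word-final consonant, 'T' = tone, and '_' = word break.
from lingpy import prosodic_string

print(prosodic_string('tʰ ɔ x t ə r'.split(' ')))     # -> 'AXMBYN'
print(prosodic_string('th o x _ th o x'.split(' ')))  # -> 'AXN_AXN'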
def evaluate_string(self, string, tokens=False, **keywords):
    setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
    if not tokens:
        tokens = ipa2tokens(string)
    score = 1
    dist = self.dist['#']

    prostring = prosodic_string(tokens, rcParams['art'],
                                cldf=keywords['cldf'],
                                diacritics=keywords['diacritics'],
                                stress=keywords['stress'])
    if self.classes:
        c = tokens2class(tokens, self.model, cldf=keywords['cldf'],
                         diacritics=keywords['diacritics'],
                         stress=keywords['stress'])
        teststring = list(zip(prostring, c))
    else:
        teststring = list(zip(prostring, tokens))

    scores = []

    # walk through the segments and multiply up the transition probabilities
    while len(teststring) > 0:
        segment = teststring.pop(0)
        freq = dist.count(segment)
        allf = len(dist)
        s = freq / allf
        score = score * s
        scores += [s]
        dist = self.dist[segment]

    # the score of the final segment is applied once more
    score = score * s
    scores += [s]

    # length-normalized log score
    lscore = np.log10(score)
    lscore = lscore / len(tokens)

    return score, lscore
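
# Usage sketch for the scorer above (assuming it is lingpy's
# MCPhon.evaluate_string from lingpy.sequence.generate): train a bigram model
# on a small word list, then score a string, getting back the raw probability
# and its length-normalized log10 value.
from lingpy.sequence.generate import MCPhon

mc = MCPhon(['hand', 'fus', 'kopf', 'kind'])
score, lscore = mc.evaluate_string('hand')  # scoring a training word is safe;
                                            # unseen bigrams would yield 0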
def edit_dist(seqA, seqB, normalized=False, restriction=''):
    """
    Return the edit distance between two strings.

    Parameters
    ----------
    seqA,seqB : str
        The strings that shall be compared.
    normalized : bool (default=False)
        Specify whether the normalized edit distance shall be returned. If
        no restrictions are chosen, the edit distance is normalized by
        dividing by the length of the longer string. If *restriction* is set
        to *cv* (consonant-vowel), the edit distance is normalized by the
        length of the alignment.
    restriction : {"cv"} (default="")
        Specify the restrictions to be used. Currently, only ``cv`` is
        supported. This prohibits matches of vowels with consonants.

    Notes
    -----
    The edit distance was first formally defined by V. I. Levenshtein
    (:evobib:`Levenshtein1965`). The first algorithm to compute the edit
    distance was proposed by Wagner and Fischer (:evobib:`Wagner1974`).

    Returns
    -------
    dist : {int, float}
        The edit distance, which is a float if normalized is set to
        ``True``, and an integer otherwise.

    Examples
    --------
    Align two sequences::

        >>> seqA = 'fat cat'
        >>> seqB = 'catfat'
        >>> edit_dist(seqA, seqB)
        3

    """
    seqA, seqB = _as_lists(seqA, seqB)
    if restriction in ['cv', 'consonant-vowel']:
        resA = prosodic_string(seqA, 'cv')
        resB = prosodic_string(seqB, 'cv')
        return malign.restricted_edit_dist(seqA, seqB, resA, resB, normalized)
    return malign.edit_dist(seqA, seqB, normalized)
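
# Further examples (a sketch, assuming this is lingpy's top-level edit_dist):
# the normalized variant divides by the length of the longer sequence, and
# the 'cv' restriction compares tokenized sequences without allowing vowels
# to match consonants.
from lingpy import edit_dist

edit_dist('fat cat', 'catfat')                    # -> 3
edit_dist('fat cat', 'catfat', normalized=True)   # -> 3 / 7, roughly 0.43
edit_dist('v o l d e m o r t'.split(), 'w a l d e m a r'.split(),
          restriction='cv')                       # integer distance under cv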
def __init__(self, words, tokens=False, prostrings=[], classes=False,
             class_model=rcParams['model'], **keywords):
    setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
    self.model = class_model
    self.words = words
    self.tokens = []
    self.bigrams = []
    self.classes = []

    # start filling the dictionary
    for i, w in enumerate(words):

        # check for tokenized string
        if not tokens:
            tk = ipa2tokens(w, **keywords)
        else:
            tk = w[:]
        self.tokens += [tk]

        # create prosodic string
        if prostrings:
            p = prostrings[i]
        else:
            p = prosodic_string(tk, rcParams['art'],
                                cldf=keywords['cldf'],
                                diacritics=keywords['diacritics'],
                                stress=keywords['stress'])

        # create classes
        if classes:
            c = tokens2class(tk, class_model, cldf=keywords['cldf'],
                             diacritics=keywords['diacritics'],
                             stress=keywords['stress'])
            bigrams = list(zip(p, c))
            self.classes += [c]
        else:
            # combine prosodic string and tokens into bigrams
            bigrams = list(zip(p, tk))

        self.bigrams += [bigrams]

    # init the mother object
    MCBasic.__init__(self, self.bigrams)
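
# Construction sketch (assuming the class is lingpy's MCPhon): plain IPA
# strings are tokenized internally; pass tokens=True if the words are already
# tokenized lists. get_string(), assuming it behaves as documented for
# MCPhon, then samples a new word from the bigram model.
from lingpy.sequence.generate import MCPhon

mc = MCPhon(['hand', 'fus', 'kopf', 'kind'])
new_word = mc.get_string()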
def __init__(self, seqs, seqB=False, **keywords):
    # check whether the input is a single pair of sequences or a list of
    # sequence pairs
    if seqB:
        self.seqs = [(seqs, seqB)]
    else:
        self.seqs = seqs

    # add the basic representation of sequences
    self.tokens = []
    self.prostrings = []

    # define a tokenizer function for convenience
    defaults = {
        "diacritics": rcParams['diacritics'],
        "vowels": rcParams['vowels'],
        "tones": rcParams['tones'],
        "combiners": rcParams['combiners'],
        "breaks": rcParams['breaks'],
        "stress": rcParams['stress'],
        "merge_vowels": rcParams['merge_vowels'],
    }
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    tokenize = lambda x: ipa2tokens(x, **keywords)

    # tokenize each sequence pair and compute its prosodic strings
    for seqA, seqB in self.seqs:
        tokA, tokB = tokenize(seqA), tokenize(seqB)
        proA, proB = \
            prosodic_string(tokA, **keywords), prosodic_string(tokB, **keywords)

        self.tokens += [[tokA, tokB]]
        self.prostrings += [[proA, proB]]
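
# Usage sketch (this initializer matches lingpy's Pairwise): pass either two
# sequences or a list of sequence pairs; tokens and prosodic strings are
# precomputed here, and align() runs the actual alignment.
from lingpy import Pairwise

pair = Pairwise('tʰɔxtər', 'dɔːtər')
pair.align()
print(pair)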
def test_prosodic_weights(self):
    seq = 'tʰ ɔ x t ə r'.split(' ')
    assert prosodic_weights(prosodic_string(seq))[0] == 2
    assert prosodic_weights(prosodic_string(seq))[-1] == 0.8
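
# For reference (assuming lingpy's prosodic_weights): the weights are read
# off the prosodic string, so the values asserted above mean that the
# word-initial consonant ('A') is weighted highest (2) while the word-final
# consonant ('N') gets a lower weight (0.8).
from lingpy import prosodic_string, prosodic_weights

weights = prosodic_weights(prosodic_string('tʰ ɔ x t ə r'.split(' ')))
print(weights[0], weights[-1])  # -> 2 0.8, as the test asserts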
def cv_templates(wordlist, language, segments='tokens', converter=None,
                 cutoff=0.1, output='markdown', examples=3, scoredict=None,
                 splitter=False):
    """Create CV templates from wordlist data."""
    templates = defaultdict(list)
    idxs = wordlist.get_list(col=language, flat=True)
    sounds = defaultdict(list)

    def str_(list_):
        return ', '.join([' '.join(l) for l in list_[:examples]])

    if not converter:
        converter = lambda x: prosodic_string(x, _output='CcV')
    scoredict = scoredict or _scorer()
    if not splitter:
        splitter = lambda x: filter(None, tokens2morphemes(x))

    # collect one CV template per morpheme and record each sound together
    # with the prosodic context in which it occurs
    for idx in idxs:
        segs = wordlist[idx, segments]
        for word in splitter(segs):
            cv = converter(word)
            templates[cv] += [word]
            for sound, symbol in zip(word, cv):
                sounds[sound, symbol] += [word]

    # accept the most frequent templates until the coverage threshold
    # (1 - cutoff of all items) is reached; the remainder is ignored
    lengths = sum([len(v) for v in templates.values()])
    perc = lengths - (cutoff * lengths)
    patterns, ignored = [], []
    score = 0
    for k, v in sorted(templates.items(), key=lambda x: len(x[1]),
                       reverse=True):
        l = len(v)
        if score + l > perc:
            ignored += [[k, l, v]]
        else:
            patterns += [[k, l, v]]
            score += l

    # compute pattern consensus
    consensus = pattern_consensus([list(p[0]) for p in patterns], scoredict)

    # tabulate sounds by prosodic context and frequency
    sound_table = []
    for k, v in sorted(sounds.items(), key=lambda x: (x[0][1], len(x[1]))):
        sound_table += [(k[0], k[1], len(v), v)]

    if output == 'markdown':
        out = 'Pattern | Frequency | Examples\n --- | --- | --- \n'
        score = 0
        for p, l, v in patterns:
            out += '{0:15} | {1:5} | {2}\n'.format(p, l, str_(v))
            score += l
        out += '\nSound | Context | Frequency | Examples\n'
        out += ' --- | --- | --- | --- \n'
        for sound, context, l, vals in sound_table:
            out += '{0} | {1} | {2} | {3} \n'.format(sound, context, l,
                                                     str_(vals))
        out += '\n* **coverage:** {0} out of {1} patterns in the data\n'.format(
            score, lengths)
        out += '* **pattern consensus:** {0}\n'.format(' '.join(consensus))
        return out
    return patterns, ignored, sound_table
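
# Usage sketch (hypothetical data; assumes cv_templates is importable
# alongside a lingpy Wordlist whose 'tokens' column holds segmented IPA):
# the markdown report lists the most frequent CV templates, their consensus
# pattern, and a per-sound context table.
from lingpy import Wordlist

wl = Wordlist('wordlist.tsv')  # hypothetical input file
report = cv_templates(wl, 'German', output='markdown')
print(report)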