예제 #1
0
def get_structure_profile(wordlist,
                          column='ipa',
                          text=False,
                          semi_diacritics='hsʃʂʒʐzθɕʑfvθð',
                          debug=False,
                          language=None):
    """
    Create a profile of segment groups keyed by their structural positions.

    Parameters
    ----------
    wordlist : wordlist-like
        Object handed to ``lingpy.iter_rows`` together with ``'doculect'``
        and `column`.
    column : str (default="ipa")
        Name of the column holding the entries. For ``"ipa"``, spaces in an
        entry are replaced by underscores before the entry is analysed.
    text : bool (default=False)
        If True, every profile line is yielded as one tab-separated string,
        otherwise as a tuple.
    semi_diacritics : str
        Passed through unchanged to ``get_structure``.
    debug : bool (default=False)
        If True, print every row (ID, doculect, entry) while iterating.
    language : str (default=None)
        If given, only rows whose doculect equals this value are used.

    Yields
    ------
    profile_line : tuple or str
        One line per distinct (positions, segments) pair, ordered by the
        position string and then by number of occurrences, both descending.
        The fields are: the segments without spaces (for ``text=True`` the
        result of ``strip_chars(' ∼', ...)``), the space-separated segments
        (twice), the space-separated positions, ``codepoint`` of the
        segments, the number of occurrences, the doculects joined by ``','``
        (sorted by ascending number of occurrences), and up to five example
        morphemes joined by ``', '``.
    """
    profile = defaultdict(list)

    if column == 'ipa':
        def modify(entry):
            return entry.replace(' ', '_')
    else:
        def modify(entry):
            return entry

    for idx, lang, segments in lingpy.iter_rows(wordlist, 'doculect', column):
        if debug:
            print(idx, lang, segments)
        if language and language != lang:
            continue
        for structure, morpheme in get_structure(
                modify(segments), zipped=True,
                semi_diacritics=semi_diacritics):
            # Every morpheme is split into three groups, each stored as a
            # pair [positions, segments]: positions matching 'i', positions
            # matching 'mnc', and everything else.
            group_i, group_mnc, group_other = [[], []], [[], []], [[], []]
            for pos, seg in structure:
                if pos in 'i':
                    target = group_i
                elif pos in 'mnc':
                    target = group_mnc
                else:
                    target = group_other
                target[0].append(pos)
                target[1].append(seg)
            for positions, sounds in (group_i, group_mnc, group_other):
                if positions:
                    profile[' '.join(positions),
                            ' '.join(sounds)].append((lang, morpheme))

    ranked = sorted(profile.items(),
                    key=lambda item: (item[0][0], len(item[1])),
                    reverse=True)
    for (pos, seg), occurrences in ranked:
        langs = [occ[0] for occ in occurrences]
        examples = [''.join(occ[1]) for occ in occurrences]
        languages = ','.join(sorted(set(langs), key=langs.count))
        shown_examples = ', '.join(examples[:5])
        if text:
            yield '\t'.join([
                strip_chars(' ∼', seg), seg, seg, pos,
                codepoint(seg),
                str(len(langs)),
                languages,
                shown_examples
            ])
        else:
            # The code points are computed from `seg`; the previous
            # `codepoint(s)` referred to a name that does not exist here.
            yield (seg.replace(' ', ''), seg, seg, pos, codepoint(seg),
                   len(langs), languages, shown_examples)
예제 #2
0
 def test_codepoint(self):
     """Check that ``codepoint`` renders 'á' as the string 'U+00e1'."""
     expected = 'U+00e1'
     rendered = codepoint('á')
     assert rendered == expected
예제 #3
0
def simple_profile(wordlist,
                   ref='ipa',
                   semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                   merge_vowels=False,
                   brackets=None,
                   splitters='/,;~',
                   merge_geminates=True,
                   bad_word="<???>",
                   bad_sound="<?>",
                   clts=None,
                   unknown_sound="!{0}"):
    """
    Create an initial Orthography Profile using Lingpy's clean_string procedure.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : str (default=None)
        A string of bracket characters. If empty or None, a pre-defined set
        of frequently occurring brackets is used. The value is only used to
        detect words that are kept unsegmented; it is not passed on to
        clean_string.
    splitters : str
        Characters which, like brackets, cause a word to be kept as a whole
        when they occur in the cleaned string.
    merge_geminates : bool (default=True)
        Passed through unchanged to clean_string.
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    clts : dict (default=None)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    unknown_sound : str (default="!{0}")
        Format-string applied to a segment for which `clts` is given but its
        get()-method returns no (or a false) value.

    Returns
    -------
    profile : generator
        A generator of tuples (four items): the segment, its replacement
        (the segment itself, the CLTS sound, "NULL", or one of the
        bad_word/bad_sound/unknown_sound strings), the frequency as a string,
        and the unicode-codepoints. Items are ordered by descending frequency.
    """
    clts = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    # Any of these characters surviving in the cleaned string means that the
    # word is kept as one unsegmented profile entry.
    blockers = brackets + splitters
    profile = defaultdict(int)
    words = [wordlist[idx, ref] for idx in wordlist]
    for word in pb(words, desc='iterating over words'):
        if isinstance(word, list):
            word = ' '.join(word)
        cleaned_string = clean_string(word,
                                      semi_diacritics=semi_diacritics,
                                      merge_vowels=merge_vowels,
                                      brackets=None,
                                      ignore_brackets=False,
                                      split_entries=False,
                                      preparse=None,
                                      rules=None,
                                      merge_geminates=merge_geminates)[0]

        # retain whole word if there are splitters in the word
        if any(char in blockers for char in cleaned_string):
            profile[word] += 1
            bad_words.add(word)
        else:
            for segment in cleaned_string.split(' '):
                profile[segment] += 1
            # characters of the word that are absent from the cleaned string
            # are recorded separately and later mapped to NULL
            for char in word:
                if char not in cleaned_string:
                    profile[char] += 1
                    nulls.add(char)

    for s, f in pb(sorted(profile.items(), key=lambda x: x[1], reverse=True),
                   desc='preparing profile'):
        sclass = token2class(s, 'dolgo')
        if s in bad_words:
            ipa = bad_word.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif clts:
            sound = clts.get(s, False)
            if sound:
                ipa = text_type(sound)
            else:
                # honour the caller-supplied template instead of a fixed "!"
                ipa = unknown_sound.format(s)
        else:
            ipa = s
        yield s, ipa, text_type(f), codepoint(s)
예제 #4
0
def context_profile(wordlist,
                    ref='ipa',
                    col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                    merge_vowels=False,
                    brackets=None,
                    splitters='/,;~',
                    merge_geminates=True,
                    clts=False,
                    bad_word="<???>",
                    bad_sound="<?>",
                    unknown_sound="!{0}",
                    examples=2,
                    max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : str (default=None)
        A string of bracket characters. If empty or None, a pre-defined set
        of frequently occurring brackets is used. The value is only used to
        detect words that are kept unsegmented; it is not passed on to
        clean_string.
    splitters : str
        Characters which, like brackets, cause a word to be kept as a whole
        when they occur as a segment of the cleaned string.
    merge_geminates : bool (default=True)
        Passed through unchanged to clean_string.
    clts : dict (default=False)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        Format-string applied to a segment (without its context markers) for
        which `clts` is given but its get()-method returns no (or a false)
        value.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.
    max_entries : int (default=100)
        Only the first `max_entries` occurrences of a segment are used for
        the language list, the frequency and the examples.

    Returns
    -------
    profile : generator
        A generator of tuples (six items): the segment (with "^" / "$"
        marking word-initial / word-final position), its replacement, the
        examples, the languages, the frequency as a string, and the
        unicode-codepoints. The two context markers themselves are yielded
        first, mapped to "NULL"; the remaining items are ordered by
        descending number of occurrences.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    for idx, word, language in pb(wordlist.iter_rows(ref, col),
                                  desc='iter words',
                                  total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        if not word.strip():
            continue
        try:
            cleaned_string = clean_string(
                word,
                semi_diacritics=semi_diacritics,
                merge_vowels=merge_vowels,
                brackets=None,
                ignore_brackets=False,
                split_entries=False,
                preparse=None,
                rules=None,
                merge_geminates=merge_geminates)[0].split(' ')

            # retain whole word if there are splitters in the word
            # (each segment is tested as a substring of the combined
            # bracket/splitter string)
            if any(x in brackets + splitters for x in cleaned_string):
                profile[word].append((language, word))
                bad_words.add(word)
            else:
                # mark the first segment with "^" and the last with "$";
                # a single-segment word receives both markers
                last = len(cleaned_string) - 1
                for position, segment in enumerate(cleaned_string):
                    prefix = '^' if position == 0 else ''
                    suffix = '$' if position == last else ''
                    profile[prefix + segment + suffix].append(
                        (language, word))
                # characters of the word that are absent from the cleaned
                # string are recorded separately as null candidates
                joined = ' '.join(cleaned_string)
                for char in word:
                    if char not in joined:
                        profile[char].append((language, word))
                        nulls.add(char)
        except Exception:
            # Best effort: skip words that cannot be processed, but let
            # KeyboardInterrupt and SystemExit propagate.
            log.warn('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    ranked = sorted(profile.items(), key=lambda x: len(x[1]), reverse=True)
    for s, entries in pb(ranked, desc='yielding entries', total=len(profile)):
        bare = s.strip('^$')
        sclass = token2class(bare, 'dolgo')
        considered = entries[:max_entries]
        words = [entry[1] for entry in considered]
        langs = [entry[0] for entry in considered]
        languages = ', '.join(
            sorted(set(langs), key=langs.count, reverse=True))
        # NOTE(review): the frequency is computed after truncation, so it
        # never exceeds max_entries -- confirm this is intended.
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=words.count, reverse=True)[:examples])
        # NOTE(review): a null segment whose sound class is '0' is reported
        # via bad_sound rather than as NULL because that check comes first --
        # confirm this is intended.
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(bare, False)
            if sound:
                ipa = text_type(sound)
            else:
                # honour the caller-supplied template instead of a fixed "!"
                ipa = unknown_sound.format(bare)
        else:
            ipa = bare

        yield s, ipa, examples_, languages, frequency, codepoints
예제 #5
0
 def test_codepoint(self):
     """Check that ``codepoint`` renders 'á' as the string 'U+00e1'."""
     expected = 'U+00e1'
     rendered = codepoint('á')
     assert rendered == expected
예제 #6
0
def test_codepoint():
    """Check that ``codepoint`` renders 'á' as the string 'U+00e1'."""
    from lingpy.sequence.sound_classes import codepoint

    expected = 'U+00e1'
    rendered = codepoint('á')
    assert rendered == expected