Example #1
    def test_sonority_sequencing_syllable_tokenizer(self):
        """
        Test SyllableTokenizer tokenizer.
        """
        tokenizer = SyllableTokenizer()
        tokens = tokenizer.tokenize('justification')
        self.assertEqual(tokens, ['jus', 'ti', 'fi', 'ca', 'tion'])
Example #2
    def test_sonority_sequencing_syllable_tokenizer(self):
        """
        Test SyllableTokenizer tokenizer.
        """
        tokenizer = SyllableTokenizer()
        tokens = tokenizer.tokenize("justification")
        assert tokens == ["jus", "ti", "fi", "ca", "tion"]
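
Both tests exercise the same call; a minimal standalone sketch outside a test class (assuming nltk is installed):

from nltk.tokenize import SyllableTokenizer

tokenizer = SyllableTokenizer()
print(tokenizer.tokenize('justification'))  # ['jus', 'ti', 'fi', 'ca', 'tion']
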
Example #3
# Assumes the usual seq2seq-style data_utils context, e.g.:
#   import re
#   import tensorflow as tf
#   from tensorflow.python.platform import gfile
#   from nltk.tokenize import SyllableTokenizer
#   _DIGIT_RE = re.compile(br"\d")
#   _START_VOCAB = [b"_PAD", b"_GO", b"_EOS", b"_UNK"]
def create_vocabulary(vocabulary_path,
                      data_path,
                      max_vocabulary_size,
                      tokenizer=None,
                      normalize_digits=True):
    """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
    if not gfile.Exists(vocabulary_path):
        print("Creating vocabulary %s from data %s" %
              (vocabulary_path, data_path))
        vocab = {}

        # Fixed list of common English letter combinations used as the
        # vocabulary instead of the corpus-derived token counts (note that
        # "or" and "al" appear twice, so the written file will contain
        # duplicate lines).
        trans_list = [
            "ai", "ay", "ei", "ea", "ey", "ee", "ew", "eu", "oo", "oa", "ou",
            "ie", "ch", "ck", "tc", "bt", "gh", "dg", "th", "sh", "gn", "mb",
            "mn", "kn", "wh", "ng", "ph", "wr", "er", "or", "an", "al", "wa",
            "or", "ar", "ig", "qu", "il", "in", "al", "ow", "oy", "au"
        ]
        with gfile.GFile(data_path, mode="rb") as f:
            counter = 0
            for line in f:
                counter += 1
                if counter % 100000 == 0:
                    print("  processing line %d" % counter)
                line = tf.compat.as_bytes(line)
                tokens = tokenizer(line) if tokenizer else basic_tokenizer(
                    line)

                for w in tokens:
                    word = _DIGIT_RE.sub(
                        b"0", tf.compat.as_bytes(w)) if normalize_digits else w
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
            #vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
            vocab_list = _START_VOCAB + sorted(trans_list)
            if len(vocab_list) > max_vocabulary_size:
                vocab_list = vocab_list[:max_vocabulary_size]
            with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
                for w in vocab_list:
                    vocab_file.write(tf.compat.as_bytes(w) + b"\n")
                    #vocab_file.write(w + b"\n")

            # Quick sanity check of the syllable tokenizer on a sample word.
            SSP = SyllableTokenizer()
            print("mem is: ", SSP.tokenize('justification'))
Example #4
def syllabify_orth_with_nltk(token, num_sylls=None):
    # Lazily create and cache a single SyllableTokenizer instance; assumes a
    # module-level `nltk_ssp = None` has been defined.
    global nltk_ssp
    if not nltk_ssp:
        from nltk.tokenize import SyllableTokenizer
        nltk_ssp = SyllableTokenizer()
    l = nltk_ssp.tokenize(token)
    return l
Example #5
# Assumes a Django view context, e.g.:
#   import nltk
#   from nltk.tokenize import SyllableTokenizer
#   from django.http import JsonResponse
def getSyllables(request, text):
    textSyllables = []
    SSP = SyllableTokenizer()
    tokenised_sentences = nltk.sent_tokenize(text)
    for sentence in tokenised_sentences:
        tokenised_words = nltk.word_tokenize(sentence)
        #tagged_words = nltk.pos_tag(tokenised_words)
        for word in tokenised_words:
            tokenised_syllables = SSP.tokenize(word)
            #textSyllables = textSyllables.join(tokenised_syllables)
            textSyllables += tokenised_syllables

    return JsonResponse(textSyllables, safe=False)
Example #6
def syllableCount(cleansedWordsList):
    wordCountThisDocument = 0
    totalSyllsThisDocument = 0
    SSP = SyllableTokenizer()
    for words in cleansedWordsList:
        numSyllThisWord = len(SSP.tokenize(words.lower()))
        if numSyllThisWord > 0 and words:  # Run if number of syllables > 0 and list entry not null
            wordCountThisDocument = wordCountThisDocument + 1
            totalSyllsThisDocument = totalSyllsThisDocument + numSyllThisWord
    if wordCountThisDocument > 0:
        averageNumSyllables = totalSyllsThisDocument / wordCountThisDocument
    else:
        averageNumSyllables = "N/A"
    averageNumSyllableList.append(averageNumSyllables)
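
syllableCount appends one value (the average number of syllables per word) to the module-level averageNumSyllableList each time it is called. A minimal usage sketch (the list name is taken from the snippet above; the counts depend on the tokenizer):

averageNumSyllableList = []
syllableCount(['syllable', 'tokenizer', 'example'])
print(averageNumSyllableList)  # one averaged value per call
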
Example #7
def syllabify_orth_with_nltk(token, num_sylls=None):
    # Lazily create and cache a single SyllableTokenizer instance; assumes a
    # module-level `nltk_ssp = None` has been defined.
    global nltk_ssp
    if not nltk_ssp:
        from nltk.tokenize import SyllableTokenizer
        nltk_ssp = SyllableTokenizer()
    # Tokenize the lowercased form, then re-slice the original token so the
    # returned syllables keep the original casing.
    tokenl = token.lower()
    l = nltk_ssp.tokenize(tokenl)
    if tokenl != token:
        o = []
        i = 0
        for x in l:
            xlen = len(x)
            o += [token[i:i + xlen]]
            i += xlen
        l = o
    return l
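
Because tokenization runs on the lowercased form and the loop re-slices the original string, the returned syllables keep the caller's casing. A quick usage sketch (the expected split is taken from the tests in Example #1):

nltk_ssp = None
print(syllabify_orth_with_nltk('Justification'))
# -> ['Jus', 'ti', 'fi', 'ca', 'tion']
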
Example #8
    # Assumes module-level imports along the lines of:
    #   import os, yaml
    #   from nltk.corpus import cmudict
    #   from nltk.tokenize import SyllableTokenizer
    # plus an error() logging helper.
    def __init__(self, dictName='-', lang=None):
        # Sonority Sequencing Tokenizer defaults to 26 latin letters,
        # english pronunciation.
        self.SSP = SyllableTokenizer()
        self.changeLang(lang)
        self.dict = {"words": []}
        if (dictName == '-'):
            dictName = os.path.dirname(__file__) + "/dict.yaml"
        try:
            with open(dictName) as f:
                self.dict = yaml.safe_load(f)
        except BaseException:
            error("%s could not be loaded." % dictName)
        # CMU Pronunciation dictionary includes 119K+ english words plus some
        # proper nouns using the latin alphabet, occasionally with punctuation.
        self.d = cmudict.dict()
Example #9
def run(words):
    # parts = [decompose(x, stub=4) for x in words]
    SSP = SyllableTokenizer()
    parts = [SSP.tokenize(x) for x in words]
    # `stitch` is not shown in this snippet; it presumably combines the
    # per-word syllable lists into the final portmanteau.
    portmanteau = stitch("", parts)
    return portmanteau
Example #10
import re
from num2words import num2words
from typing import List, Any
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import SyllableTokenizer
import wordninja
# nltk.download('cmudict')

pro_dict = cmudict.dict()
ssp = SyllableTokenizer()


def count_syllables(words: List[str]) -> set:
    word_syl_dict = {word: [] for word in words}

    for word in words:
        syllable_nums = cmudict_syl(word)
        if syllable_nums is False:
            syllable_nums = [ssp_syl(word)]
        word_syl_dict[word] = syllable_nums

    total_syl_set = set()
    recur_count_syl(0, word_syl_dict, words.copy(), total_syl_set)

    return total_syl_set
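

# The helpers cmudict_syl() and ssp_syl() used above are not shown in this
# excerpt. A minimal sketch of what they might look like, given how
# count_syllables() calls them (names and behaviour here are assumptions,
# reusing the pro_dict / ssp objects defined at the top of the snippet):
def cmudict_syl(word):
    # Possible syllable counts from the CMU dict, one per listed pronunciation
    # (vowel phones end with a stress digit), or False if the word is missing.
    prons = pro_dict.get(word.lower())
    if not prons:
        return False
    return [sum(1 for phone in pron if phone[-1].isdigit()) for pron in prons]


def ssp_syl(word):
    # Fallback: number of syllables according to the sonority tokenizer.
    return len(ssp.tokenize(word))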


def recur_count_syl(syl_count: int, word_syl_dict: dict, words: List[str],
                    total_syl_set: set):
    while words:
Example #11
# Assumes module-level imports along the lines of:
#   import os, re, yaml, pyphen
#   from nltk.corpus import cmudict
#   from nltk.tokenize import SyllableTokenizer
# plus the project's wordSyl constants and error()/warning() logging helpers.
class MultiSylT(object):
    def __init__(self, dictName='-', lang=None):
        # Sonority Sequencing Tokenizer defaults to 26 latin letters,
        # english pronunciation.
        self.SSP = SyllableTokenizer()
        self.changeLang(lang)
        self.dict = {"words": []}
        if (dictName == '-'):
            dictName = os.path.dirname(__file__) + "/dict.yaml"
        try:
            with open(dictName) as f:
                self.dict = yaml.safe_load(f)
        except BaseException:
            error("%s could not be loaded." % dictName)
        # CMU Pronunciation dictionary includes 119K+ english words plus some
        # proper nouns using the latin alphabet, occasionally with punctuation.
        self.d = cmudict.dict()

    def changeLang(self, lang):
        if lang not in pyphen.LANGUAGES:
            lang = 'en'
        self.pyphen = pyphen.Pyphen(lang=lang)
        self.lang = lang

    def multiTokenize(self, originalWord):
        """ Return options for tokenizing a word. """
        word = self.deformat(originalWord)
        tokenizations = []
        # If the word exists in our dictionary, include those tokenizations.
        if (word in self.dict['words']):
            tokenizations += self.dict['words'][word]
        # If the word exists in the dictionary (but as singular), include it.
        elif (word[-1] == 's' and word[0:-1] in self.dict['words']):
            for tk in self.dict['words'][word[0:-1]]:
                nt = tk.copy()
                nt[-1] = nt[-1] + 's'
                tokenizations.append(nt)

        # Otherwise, use an algorithm to split the word into syllables.
        tokenized = self.SSP.tokenize(word)
        splitter = "\t"
        hyphenated = self.pyphen.inserted(word, splitter).split(splitter)

        if self.lang == 'en':
            tokenizations = self._addMatchingSylCount(word, tokenizations,
                                                      tokenized, hyphenated)
        elif self.lang == 'es':
            # Sonority Sequencing doesn't work well with strong and weak vowels
            esTokenized = self._spanishTokenize(word)
            if esTokenized not in tokenizations:
                tokenizations.append(esTokenized)
            # Hunspell tokenizations are not as accurate as our tokenized ones:
            # only include them if the syllable count matches.
            if hyphenated not in tokenizations and len(hyphenated) == len(
                    esTokenized):
                tokenizations.append(hyphenated)
        else:
            if tokenized not in tokenizations:
                tokenizations.append(tokenized)
            if hyphenated not in tokenizations:
                tokenizations.append(hyphenated)
        return list(
            map(self.reformat, tokenizations,
                [originalWord for x in range(0, len(tokenizations))]))

    def _addMatchingSylCount(self, word, tokenizations, tokenized, hyphenated):
        sylCounts = self.nsyl(word)
        # If the tokenized or hyphenated version has the same number of
        # syllables as one of the CMU STT pronunciations, but we don't
        # already have that syllable-count represented, include it.
        lh = len(hyphenated)
        if (lh in sylCounts and lh not in map(len, tokenizations)):
            tokenizations.append(hyphenated)
        lt = len(tokenized)
        if (lt in sylCounts and lt not in map(len, tokenizations)):
            tokenizations.append(tokenized)
        if (1 in sylCounts and [word] not in tokenizations):
            tokenizations.append([word])

        # Fallback if there are no tokenizations.
        if (len(tokenizations) == 0):
            warning("%s has %d syllables," %
                    (str(hyphenated), len(hyphenated)) + ' expected: ' +
                    (" or ".join(map(str, sylCounts)) or "???"))
            tokenizations.append(hyphenated)
        return tokenizations

    def _spanishTokenize(self, word):
        """ Make sure spanish hyphenated syllable counts are correct
        https://www.spanishdict.com/guide/spanish-syllables-and-syllabification-rules
        """
        # Accented vowels always get their own syllable.
        accentedVowels = "áéíóú"
        # Two strong vowels together are split into different syllables.
        strongVowels = "aeo"
        # Weak vowels can blend with each other, or with strong vowels.
        weakVowels = "iuü"
        vowels = accentedVowels + strongVowels + weakVowels
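        # Worked illustrations of the pre-pass below (derived from these rules,
        # not from running the tokenizer): "poeta" becomes "po-eta" because two
        # adjacent strong vowels are split, and "país" becomes "pa-ís" because
        # an accented vowel following another vowel starts a new syllable.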

        # Split certain vowel pairs, and let SSP do the rest.
        newWord = ""
        prevLetter = " "
        for letter in word:
            if (letter in vowels and prevLetter in accentedVowels) or \
               (letter in accentedVowels and prevLetter in vowels) or \
               (letter in strongVowels and prevLetter in strongVowels):
                newWord += "-" + letter
            else:
                newWord += letter
            prevLetter = letter
        # TODO: Fix tokenization for double-r and double-l
        tokenized = self.SSP.tokenize(newWord)
        return list(filter(lambda syl: syl != '-', tokenized))

    def deformat(self, word):
        return re.sub("[" + wordSyl.smartSingles + "]", "'",
                      word.lower().strip(wordSyl.puncs))

    def reformat(self, oldTokenized, template):
        # Since tokenized is mutable, create a duplicate of it.
        tokenized = list(oldTokenized)

        # Match the case
        plainTemp = template.strip(wordSyl.puncs)
        if (plainTemp and plainTemp.isupper()):
            tokenized[0] = tokenized[0].upper()
        elif (plainTemp and plainTemp[0].isupper()):
            tokenized[0] = tokenized[0][0].upper() + tokenized[0][1:]
        elif (plainTemp and len(tokenized[0]) > 1 and len(plainTemp) > 1
              and plainTemp[1].isupper()):
            tokenized[0] = tokenized[0][0] + \
                tokenized[0][1].upper() + tokenized[0][2:]

        # Prepend/append the punctuations
        match = re.search(r"^[" + wordSyl.puncs + r"]+", template)
        starting = match.group(0) if match else ''
        match = re.search(r"[" + wordSyl.puncs + r"]+$", template)
        ending = match.group(0) if match else ''
        tokenized[0] = starting + tokenized[0]
        tokenized[-1] = tokenized[-1] + ending

        # Replace smart single-quotes
        dumbPlaceholder = "\n"
        splitter = "\t"
        templateNoDumb = template.replace("'", dumbPlaceholder)
        for letter in templateNoDumb:
            if letter in wordSyl.smartSingles + dumbPlaceholder:
                tokenized = splitter.join(tokenized).replace("'", letter,
                                                             1).split(splitter)
        tokenized = splitter.join(tokenized).replace(dumbPlaceholder,
                                                     "'").split(splitter)

        return tokenized

    def nsyl(self, word):
        """Get the number of syllables a word should be
        from the CMU Pronunciation dictionary.
        Returned as a list to account for variants."""
        if word in self.d:
            return [
                len(list(y for y in x if y[-1].isdigit()))
                for x in self.d[word.lower()]
            ]
        return []
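
nsyl counts syllables by counting the CMU phones whose last character is a stress digit (0, 1 or 2), i.e. the vowels, one count per listed pronunciation. A standalone sketch of the same idea outside the class (assuming nltk's cmudict data is installed):

from nltk.corpus import cmudict

d = cmudict.dict()

def cmu_syllable_counts(word):
    # One syllable count per pronunciation; vowel phones end with a stress digit.
    return [sum(1 for phone in pron if phone[-1].isdigit())
            for pron in d.get(word.lower(), [])]

print(cmu_syllable_counts('justification'))  # e.g. [5]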