Пример #1
  def __init__(self, dataset="picasso2", basedir="parsed_data"):
    """Load the English word list and the token streams of one dataset.

    dataset -- dataset name; used both as a directory and as the file name.
    basedir -- directory that contains one sub-directory per dataset.
    """
    self.dataset = dataset
    self.basedir = basedir
    filename = "%s/%s/source/%s" % (basedir, dataset, dataset)
    self.debug("poemparser:init:dataset parsing '%s'..." % filename)

    # One word per line; stored stripped and lower-cased.
    known_words = set()
    with open("pyParser/english_words.txt") as word_file:
      for line in word_file:
        known_words.add(line.strip().lower())
    self.english_words = known_words

    def without_dashes(stream):
      # Every token stream is used with its '-' entries removed.
      return [item for item in stream if item != '-']

    # Open and analyze the text data.
    self.unknownWords = {}
    self.iffyWords = {}
    self.allmatch = {}
    self.alltokens = self.openTokens(filename)
    self.parsedTokens = without_dashes(self.alltokens[0])
    self.replacedTokens = without_dashes(self.alltokens[1])
    self.fullTokens = without_dashes(self.alltokens[2])
    self.tokens = self.parsedTokens
    self.loweredTokens = [item.lower() for item in self.replacedTokens]
    self.pos_tags = nltk.pos_tag(self.replacedTokens)
    self.text = nltk.Text(self.tokens)
    self.dict = cmudict.dict()
    self.lastspeed = 0
    self.midiindex = 0
    self.debug("poemparser:init:words %s" % self.fullTokens)
    self.debug("poemparser:init:tokens %s" % self.tokens)
    self.debug("poemparser:init:text %s" % self.text)
Пример #2
def reset_country_codes_to_emoflags(cc_path='country_codes.txt',
        irange=ET.FLAGS_RANGE, charset='utf-8'):
    Using a country code dict, set the name and syllable fields
    in a copy of emo_tuples.
    cmu_prons = cmudict.dict() # get the CMU Pronouncing Dict
    cc_dict = load_country_codes(cc_path)

    for tup in ET.EMO_TUPLES[irange.start:irange.stop]:
        cc2 = tup[ET.INDEX_ALTERNATIVES][0].strip(':').upper()
        # print(cc2, '  ', end='')
        monos, polys, names = [], [], [cc2]
        names.extend(nm for nm in tup[ET.INDEX_POLYSYLLABLES] if len(nm) > 2)
            # print(names, file=sys.stderr)
        except KeyError:
            print("{} missing {}\n\tusing: {}".format(
                   cc2, tup, names), file=sys.stderr)
        for name in set(names):
            if sylc.syl_count(cmu_prons, name) == 1:
        tupal = list(tup)
        tupal[ET.INDEX_WORDSYLLABLES] = monos
        tupal[ET.INDEX_POLYSYLLABLES] = polys
        ret = tuple(tupal)
        print("    {},".format(ret), file=sys.stdout)
        # tupal[ET.INDEX_WORDSYLLABLES] =
Пример #3
	def stress(self,bysentence=False):
		tokenizes (I guess) the words in self.text by the stress pattern in each of the words.
		vowels = ['A','E','I','O','U']
		possible_stresses = ['1','2','0']
		totaldic = cmudict.dict()
		def gen_stress(stripped_text):
			stress_list = []
			for word in stripped_text.lower().split():
					stress = str()
					phonemized = totaldic[word][0]
					for phoneme in phonemized:
						for stresser in possible_stresses:
							if stresser in phoneme:
								stress += stresser
					for index, sound in enumerate(phonemized[len(phonemized)-2:len(phonemized)]):
						for vowel in vowels:
							if vowel in sound:
								stress_list.append([word,stress,[index, sound],phonemized,len(phonemized)])
				except KeyError:
					# print("{} couldn't be found".format(word))
			return stress_list

		if bysentence:
			sentences = PunktSentenceTokenizer().tokenize(master_str)
			stress_by_sentence = [sentence.translate(string.maketrans("",""), string.punctuation) for sentence in sentences]
			return [gen_stress(sentence) for sentence in stress_by_sentence]

		elif not bysentence:
			stress_total = self.text.translate(string.maketrans("",""), string.punctuation) 
			return gen_stress(stress_total)
Пример #4
 def __compliant_haiku(self, haiku_source):
     """Ensure that newlines remain and all 
     other punctuation has been stripped"""
     """Ensure that newlines remain and all 
     other punctuation has been stripped"""
     dict = cmudict.dict()
     haiku_lines = haiku_source.splitlines()
     syllables = []
     for line in haiku_lines:
         if line == "":
         for word in line.split(" "):
             sal.append(len([x for x in dict[word][0] if x[-1].isdigit()]))
     pattern = [5,7,5]
     if len(syllables) % 3 == 0:
         while len(syllables) > 0:
             if syllables[:3] == pattern:
                 for x in range(2,-1,-1):
                 return False
         return False
     return True
Пример #5
 def __init__(self,text):
     """Zero the sentence/word/syllable counters and load the CMU dictionary.

     NOTE(review): `text` is not used in the lines visible here.
     """
     # Initialize vars
     self.sent_count = 0
     self.word_count = 0
     self.syll_count = 0
     self.cmu = cmudict.dict()
Пример #6
def approx_nsyl(word, pron_dict=None):
	"""Credit - Jason Sundram, http://runningwithdata.com/post/3576752158/w

	Return the max syllable count in the case of multiple pronunciations.

	word -- the word to look up; matched case-insensitively.
	pron_dict -- optional mapping of lower-case word -> list of phoneme
		lists. When omitted, cmudict.dict() is loaded; pass a dictionary
		that is already loaded to avoid rebuilding it on every call.

	Returns 0 when the word has no entry.
	"""
	if pron_dict is None:
		pron_dict = cmudict.dict()
	# Look up the same (lower-cased) key that is used for counting, so that
	# capitalised words are no longer reported as unknown.
	pronunciations = pron_dict.get(word.lower())
	if not pronunciations:
		return 0
	# A phoneme that ends in a stress digit is a vowel, i.e. one syllable.
	return max(
		sum(1 for phoneme in pron if phoneme[-1].isdigit())
		for pron in pronunciations
	)
Пример #7
def group_rhyming_tweets(filtered_tweet_list):
    """groups rhyming tweets into lists, then returns a list containing those lists. lists are sorted so that the list with the most rhyming words
    is first in the list."""
    copy_filtered_tweet_list = list(filtered_tweet_list)
    dictionary = cmudict.dict()
    grouped_rhyming_tweets = []
    index = 0
    while (
        index < len(copy_filtered_tweet_list) - 1
    ):  # don't need to check last element for rhymes against other words b/c all pairs of words checked already by that point
        rhyme_list = [copy_filtered_tweet_list[index]]
        i = index + 1
        while i < len(copy_filtered_tweet_list):
            if (
                do_sentences_rhyme(copy_filtered_tweet_list[index], copy_filtered_tweet_list[i], dictionary)
                or sentence_rhyme_score(copy_filtered_tweet_list[index], copy_filtered_tweet_list[i]) > 4
                i = i - 1
            i = i + 1
        rhyme_list = list(set(rhyme_list))  # remove non-unique entries by converting to a set and back again
        index = index + 1
    # grouped_rhyming_tweets = sorted(grouped_rhyming_tweets, key = len, reverse = True)
    grouped_rhyming_tweets = [i for i in grouped_rhyming_tweets if len(i) > 1]
    return grouped_rhyming_tweets
Пример #8
	def compile_meter_list(self, new_words, verbose=True):
	    """Compile the CMU corpus stress digits of each word into nested lists.

	    new_words -- iterable of cleaned words.
	    verbose -- accepted for interface compatibility; not used here.

	    Returns a list with one entry per word, shaped
	    [word, [[word + str(n), [stress digits]], ...]], with one inner
	    version per pronunciation. A word missing from the corpus gets a
	    single version whose meter is -1 for each syllable reported by
	    sylco().
	    """
	    iambic = cmudict.dict()                     # connect to cmu corpus, called iambic
	    big_list = []                               # collects [word, versions] for every word
	    for word in new_words:
	        syl_num = sylco([word])
	        try:
	            pronunciations = iambic[word.lower()]
	        except KeyError:                        # word isn't in corpus
	            # NOTE(review): as before, a word is skipped when sylco()
	            # does not return exactly one count -- confirm this is wanted.
	            if len(syl_num) != 1:
	                continue
	            # -1 marks "stress unknown" for each estimated syllable.
	            versions_list = [[word + str(0), [-1] * syl_num[0]]]
	        else:
	            versions_list = []
	            for n, pron in enumerate(pronunciations):
	                # keep only the stress digits embedded in the phonemes
	                meter_list = [int(char) for phoneme in pron for char in phoneme if char.isdigit()]
	                versions_list.append([word + str(n), meter_list])
	        # Words found in the corpus are now collected too; previously only
	        # the fallback branch appended to big_list.
	        big_list.append([word, versions_list])
	    return big_list
Пример #9
    def on_load(self):
        """Announce the class, register the nltk_data directory and load cmudict.

        The directory `nltk_data` inside the context's working directory is
        appended to nltk.data.path before the CMU dictionary is loaded into
        self.d.
        """
        import os  # local import: the file's import block is not visible here

        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("Loading: " + self.__class__.__name__)
        wd = self.context.getWorkingDir()
        # os.path.join gives the same result as plain concatenation when wd
        # ends with a separator, and inserts one when it does not.
        nltk.data.path.append(os.path.join(wd, "nltk_data"))

        self.d = cmudict.dict()
Пример #10
def make_cmu_wordlist():
    Strip the CMU Pronunciation Dictionary of accent marks.

    Add '$' to the end of strings (for markov chain use).

    Pickle and dump to 'cmu.p'.
    d = cmudict.dict()
    pronunciation_list = d.values()

    edited_list = []
    for entry in pronunciation_list:
        for word in entry:
            edited_word = ["#"]
            for i in xrange(len(word)):
                #remove accent marks
            #Use '$' to mark the end of words

#    with open('wordlists/cmu.p', 'w') as outfile:
#        pickle.dump(edited_list, outfile)
    return edited_list
Пример #11
  def __init__(self):
    """Store the result of gen_n2w() and load the CMU pronouncing dictionary."""

    # generate n2w (presumably a number-to-words table -- confirm in gen_n2w)
    self.n2w = gen_n2w()

    # syllable dict
    self.cmu = cmudict.dict()
Пример #12
def parse_sentence(sent, syl=partial(syllabify, English),
    sent = sent.strip()
    if not len(sent):
    tokens = list(filter(len, map(preprocess, sent.split())))
    phonemes = (map(syl, pron_dict[t]) for t in tokens)

    nsyllables = set()
    final_sounds = set()
    for words in product(*phonemes):
        if not len(words):

        # Count the number of syllables and extract the stress pattern.
        stress, syllables = zip(*((s[0], s[1:]) for w in words for s in w))

        # Compute the final sound.
        final_syllable = syllables[-1]
        if len(final_syllable[2]):
            final_sound = "_".join(map("_".join, final_syllable[1:]))
        elif len(final_syllable[0]):
            final_sound = "{0}_{1}".format(final_syllable[0][-1],
            final_sound = "_".join(final_syllable[1])

        # Update the possible versions for this sentence.
        final_sounds.add(final_sound + "_{0}".format(int(stress[-1] > 0)))

    return nsyllables, final_sounds, [tokens[-1]]
Пример #13
def fix_db():

    print "* Executing database FIX procedure..."

    # connect to db
    mongodb_url = os.getenv("OPENSHIFT_MONGODB_DB_URL")
    client = pym.MongoClient(mongodb_url)
    db = client["shalk"]
    coll = db["ngrams"]

    base_data_dir = os.getenv("OPENSHIFT_DATA_DIR")
    if not base_data_dir:
        base_data_dir = "../data/"

    # initialize cmu dict
    nltk.data.path = ["{0}nltk/".format(base_data_dir)]
    cdict = cmudict.dict()

    count = 0
    upcount = 0
    mod = 100

    # iterate over all docs that need fixing
    orlist = [
        {"syllables": {"$exists": False}},
        {"rand": {"$exists": False}},
        {"type": {"$exists": False}},
        {"rhyme": {"$exists": False}},
    ngrams = coll.find({"$or": orlist})
    total = ngrams.count()

    for ngram in ngrams:
        upngram = False
        lastword = get_last_word(ngram)

        if "syllables" not in ngram:
            upngram = True
            ngram["syllables"] = count_syllables(lastword, cdict)
        if "rand" not in ngram:
            upngram = True
            ngram["rand"] = random.random()
        if "rhyme" not in ngram:
            upngram = True
            ngram["rhyme"] = get_rhyme(lastword, cdict)

        if not upngram:
            count += 1

        update_ngram(ngram, db)

        upcount += 1
        count += 1
        if count % mod == 0:
            print "- {0} out of {1} analysed! Docs updated: {2}".format(count, total, upcount)

    print "* Database FIX procedure finished!"
Пример #14
def does_rhyme_unit_test():
    """Print the does_rhyme() result for a fixed list of word pairs."""
    dictionary = cmudict.dict()
    word_pairs = [
        ('lol', 'bol'),
        ('cat', 'dog'),
        ('cat', 'bat'),
        ('cat', 'tot'),
        ('cat', 'tot'),
        ('hello', 'yellow'),
    ]
    for first, second in word_pairs:
        print(does_rhyme(first, second, 2, dictionary))
Пример #15
def does_rhyme_unit_test():
    """Run does_rhyme() over a few sample pairs and print each result."""
    dictionary = cmudict.dict()
    cases = (
        ("lol", "bol"),
        ("cat", "dog"),
        ("cat", "bat"),
        ("cat", "tot"),
        ("cat", "tot"),
        ("hello", "yellow"),
    )
    for left, right in cases:
        print(does_rhyme(left, right, 2, dictionary))
Пример #16
def load_pronunciations(pronun_dictionary_name='cmudict', stress='unstressed'):
    """ note that we only support cmudict from nltk

    stress -- must be one of STRESS_OPTIONS, otherwise TypeError is raised.

    The dictionary comes from nltk's cmudict; when that lookup fails with
    LookupError or AttributeError, load_cmu_pickle() is used instead.
    """
    if stress not in STRESS_OPTIONS:
        raise TypeError("unsupported stress option: %r" % (stress,))

    try:
        cmu = cmudict.dict()
    # The tuple is required: "except LookupError, AttributeError" catches only
    # LookupError and rebinds the name AttributeError to the caught exception.
    except (LookupError, AttributeError):
        cmu = load_cmu_pickle()
Пример #17
 def __init__(self, wav_folder):
     """Create the empty phone table and the pronunciation dictionaries.

     NOTE(review): `wav_folder` is not used in the lines visible here.
     """
     self.phones = {}
     # Punctuation symbols always have an entry: sentence-final marks map to
     # 'double_sil', the comma to 'sil'.
     self.pron_dict = {
         '.': 'double_sil',
         '?': 'double_sil',
         '!': 'double_sil',
         ',': 'sil',
     }
     self.whole_dict = cmudict.dict()
Пример #18
def getMulti():
	"""Build a rhymeGroup per key of rhymeToPros and return the ones kept.

	A group is kept when groupHasAtLeastOneDifference() accepts it and it has
	neither a single word nor a single pronunciation. Sizes are printed.
	"""
	print("rhymeToPros has "+str(len(rhymeToPros))+" items")
	print("pronunciationToWords has "+str(len(pronunciationToWords))+" items")
	groups = []
	for rhyme in rhymeToPros:
		groups.append(rhymeGroup(rhyme, rhymeToPros, pronunciationToWords, syllabifier.syllabify))
	multi = []
	for group in groups:
		if not groupHasAtLeastOneDifference(group):
			continue
		if group.HasOneWord() or group.HasOnePronunciation():
			continue
		multi.append(group)
	print("English has "+ str(len(multi))+" good rhyme groups\n")
	return multi
Пример #19
def num_syllables(word, pron_dict=None):
    """Return the syllable count of *word* from its first pronunciation.

    word -- the word to look up; matched case-insensitively.
    pron_dict -- optional mapping of lower-case word -> list of phoneme
        lists. When omitted, cmudict.dict() is loaded; pass a loaded
        dictionary to avoid rebuilding it on every (recursive) call.

    A hyphenated word is first looked up with its hyphens removed; if that
    form is unknown, the counts of its hyphen-separated parts are summed.

    Raises KeyError when the word (or one of its parts) has no entry.
    """
    if pron_dict is None:
        pron_dict = cmudict.dict()
    word = word.lower()
    if "-" in word:
        joined = word.replace("-", "")
        if joined in pron_dict:
            word = joined
        else:
            # Empty parts (from doubled or trailing hyphens) add no syllables.
            return sum(num_syllables(part, pron_dict)
                       for part in word.split("-") if part)
    # A phoneme that ends in a stress digit is a vowel, i.e. one syllable.
    return sum(1 for phoneme in pron_dict[word][0] if phoneme[-1].isdigit())
def test_num_syllables():
	"""Return the fraction of sample inputs whose syllable count is as expected."""
	s = cmudict.dict()
	tests = ['animal', 'i', '0', 1]
	expected = [3, 1, 1, 1]
	results = [num_syllables(t, s) for t in tests]
	hits = 0
	for want, got in zip(expected, results):
		if want == got:
			hits += 1
	return 1.0 * hits / len(expected)
Пример #21
def set_up_globals(ono=True):
    global dictionary
    dictionary = cmudict.dict()
    global stressed
    stressed = "1"
    global unstressed
    unstressed = "0"
    if ono:
Пример #22
def main():
    '''Generates the tracery grammar for @my_cat_ebooks.'''
        format='%(asctime)s %(levelname)8s [%(name)s] %(message)s',

    log.info('Loading CMU pronounciation dictionary')
    global cmu_pronounciations
    cmu_pronounciations = cmudict.dict()

    fruits = load_corpus("foods/fruits.json")["fruits"]

    log.info('body parts')
    body_parts = load_corpus("humans/bodyParts.json")["bodyParts"]

    amazing = load_corpus("words/encouraging_words.json")["encouraging_words"]

    superstar = [ln for s in wordnet.synsets('superstar') for ln in s.lemma_names()]

    pronouns = [
        # TODO: reintroduce this, but it affects the conjugation of the occupation.
        #   they may not be an cleaner
        #   they cleanses exultantly
        # "[he:they][him:them][hes:they're]",

    grammar = {
        "atrociously": adjly("atrocious"),
        "watermelon": fruits,
        "seven": "two three four five six seven eight nine ten eleven twelve".split(),
        "arm": body_parts,
        "amazing": amazing,
        "guitar": instruments(),
        "superstar": superstar,
        "setPronouns": pronouns,
        "setOccupation": occupations(),
        "stanza": [
            for s, weight in stanza_weights.iteritems()
            for _ in xrange(weight)
        "origin": ["#[#setPronouns#][#setOccupation#]stanza#"],

    log.info('writing grammar')
    with open('grammar.json', 'w') as f:
        json.dump(fp=f, indent=2, obj=grammar, sort_keys=True)
Пример #23
def recover_file_to_db(datafile):

    filename = datafile.rsplit("/")[-1]
    print "* Recovering file [{0}] into db...".format(filename)

    # connect to db
    mongodb_url = os.getenv("OPENSHIFT_MONGODB_DB_URL")
    client = pym.MongoClient(mongodb_url)
    db = client["shalk"]
    coll = db["ngrams"]

    base_data_dir = os.getenv("OPENSHIFT_DATA_DIR")
    if not base_data_dir:
        base_data_dir = "../data/"

    # initialize cmu dict
    nltk.data.path = ["{0}nltk/".format(base_data_dir)]
    cdict = cmudict.dict()

    count = 0
    mod = 1000

    # open file in reverse, and import it until we find the point where we stopped
    ngrams = []
    for line in reversed(open(datafile).readlines()):
        ngram = get_ngram(line, cdict)

        if not ngram:

        # stop we find this ngram in the db already
        if find_one(ngram, db):
            # if `force`, we will iterate over all docs, but will ignore the ones that are already inserted
            if args.force:
                print "- ({0}) Ngram [{1}] already in the db, jumping to the next one...".format(filename, ngram)

            print "- ({0}) Ngram [{1}] already in the db, stopping the recovery!".format(filename, ngram)

        count += 1
        if count % mod == 0:
            print "- ({0}) Inserted [{1}] ngrams into db...".format(filename, len(ngrams) * (count / mod))
            print "- ({0}) {1} -> {2}".format(filename, ngrams[0], ngrams[-1])
            insert_ngrams(ngrams, db)
            ngrams = []

    print "- ({0}) Inserting last [{1}] ngrams into db...".format(filename, len(ngrams))
    insert_ngrams(ngrams, db)

    print "* Finished importing file [{0}]!".format(filename)
Пример #24
    def __init__(self, wav_folder):
        """Set up the output audio, the phone table and the CMU dictionary.

        wav_folder -- passed unchanged to self.get_wavs().
        """
        self.out = SA.Audio(rate=16000) # Create a blank audio for output, with a frequency of 16000

        self.phones = self.get_wavs(wav_folder) # Add wavs as audio objects for each phoneme
                                                # and additional elements for pause breaks

        # NOTE(review): 250 and 500 look like pause lengths in milliseconds --
        # confirm against add_phone_break.
        self.add_phone_break('comma - break', 250)
        self.add_phone_break('sentence - break', 500)

        self.word_phones_dict = cmudict.dict()
Пример #25
 def __init__(self):
     """Load cmudict and index informal spellings of its keys.

     self.unknown_dict maps a variant spelling to the dictionary key it was
     derived from. The three rules are applied independently:
       * apostrophes removed,
       * a final "ing" shortened to "in",
       * "every" shortened to "evry".
     When two keys produce the same variant, the later key wins.
     """
     self.dict = cmudict.dict()
     self.unknown_dict = {}
     for key in self.dict:
         if "'" in key:
             self.unknown_dict[key.replace("'", '')] = key
         if key.endswith('ing'):
             # Drop only the final "g". str.replace would also rewrite an
             # "ing" inside the word ("singing" -> "sinin").
             self.unknown_dict[key[:-1]] = key
         if 'every' in key:
             self.unknown_dict[key.replace('every', 'evry')] = key
Пример #26
def SyllableCalculator(text, pron_dict=None):
    """Return the total syllable count of *text* as a float.

    text -- the string to tokenise.
    pron_dict -- optional mapping of lower-case word -> list of phoneme
        lists. When omitted, cmudict.dict() is loaded; pass a loaded
        dictionary to avoid rebuilding it on every call.

    Each token counts as the largest syllable count among its
    pronunciations; a token without an entry counts as one syllable.
    """
    d = cmudict.dict() if pron_dict is None else pron_dict
    counter = 0.0
    # Raw string: the pattern is unchanged, but "\w" and "\-" are no longer
    # invalid escape sequences in a normal string literal.
    tokens = re.findall(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+", text)
    for token in tokens:
        pronunciations = d.get(token.lower())
        if pronunciations:
            # A phoneme that ends in a stress digit is one syllable.
            counter += max(
                sum(1 for phoneme in pron if phoneme[-1].isdigit())
                for pron in pronunciations
            )
        else:
            counter += 1.0
    return counter
Пример #27
    def __init__(self):
        """Instantiate a MinPairFinder.

        Note; To avoid unnecessarily repeating the work of loading the dict,
        call `get_instance` instead.
        if not self._dict:
            self._dict = cmudict.dict()
        if not self._rhymes_dict:
            self._rhymes_dict = self._get_rhymes_dict()
Пример #28
        def text_to_phoneme_2(text): # different format for speech
          """Return the phonemes of every word in raw_english as one string.

          Each word's first pronunciation is joined with '-' and followed by
          "- -"; the whole result is prefixed with "-".

          NOTE(review): the `text` argument is ignored; the words come from
          `raw_english` in the enclosing scope.
          """
          phoneme_dict = cmudict.dict()
          pieces = ["-"]
          for word in raw_english:
            # there should be a counter somewhere for each phonemic version
            pieces.append('-'.join(phoneme_dict[word][0]))
            pieces.append("- -")
          return "".join(pieces)
Пример #29
    def get_rhymes(self, word):
        rhymes = []

        word_pronounciations = cmudict.dict()[word]
        for word_pronounciation in word_pronounciations:
            for rhyme, rhyme_pronounciation in cmudict.entries():
                if rhyme_pronounciation[-1] == word_pronounciation[-1]:

        return rhymes
Пример #30
def transcribeWord(word, pron_dict=None):
    """Return the first pronunciation of *word* as space-separated phonemes.

    word -- dictionary key to look up (used as given).
    pron_dict -- optional mapping of word -> list of phoneme lists. When
        omitted, cmudict.dict() is loaded; pass a loaded dictionary to avoid
        rebuilding it on every call.

    Returns False when the word has no entry.
    """
    if pron_dict is None:
        pron_dict = cmudict.dict()
    pronunciations = pron_dict.get(word)
    if not pronunciations:
        # Previously unreachable, so unknown words returned None.
        return False
    return ' '.join(pronunciations[0])
Пример #31
from haikuincidence.utils.haiku_utils import count_syllables, get_haiku
from haikuincidence.utils.text_utils import clean_text

# get data to use for dealing with tweets
track_str = get_track_str()
ignore_tweet_list = get_ignore_tweet_list()
syllable_dict = get_syllable_dict()
emoticons_list = get_emoticons_list()

# Use inflect to change digits to their English word equivalent
inflect_p = inflect.engine()
# Use the CMU dictionary to count syllables
pronounce_dict = cmudict.dict()

# guess_syl_method = "min"
guess_syl_method = "mean"
# guess_syl_method = "max"

def get_syllable_count_and_haiku(text):
    count = count_syllables(
Пример #32
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
import numpy as np
import os.path
import pandas as pd
import pickle
import random
import re
import requests
import string
import sys

# dictionary to look up pronounciations
master_dict = cmudict.dict()

def save_obj(obj, fname):
    """Pickle *obj* into the file *fname* using the highest protocol."""
    with open(fname, 'wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)

def load_obj(fname):
    """Return the object pickled in the file *fname*.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(fname, 'rb') as source:
        return pickle.Unpickler(source).load()

class Poem(object):

    def __init__(self, text, fname=None):
Пример #33
def get_syllables(word):
    d = cmudict.dict()
    return [
        len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]
Пример #34
brown.categories()  # the categories of texts in the corpus
brown.fileids(categories="hobbies")  # text files in the category of "hobbies"

brown.raw()[:100]  # the raw text of the brown corpus (note tags)
brown.sents()[0]  # first sentence in the corpus
brown.words()[:10]  # first ten words in the corpus
    0]  # first sentence, each word tagged with part-of-speech info
brown.tagged_words()[:50]  # first fifty words, all tagged

##-- Specialized corpus: cmudict --##

##     The Carnegie Mellon University Pronouncing Dictionary
##     over 130,000 words, includes stress and variant pronunciations


### ~~~~~~~~~~~~~~~~~~~~~~~ ###
### 2. Tokenizing Sentences ###
### ~~~~~~~~~~~~~~~~~~~~~~~ ###
### - Breaking a sentence string into tokens (words, etc.)
### - A few common issues:
###     Punctuation
###     Contractions (e.g., can't)
###     Non-alphabetical words

sent = "I don't want a blueberry cake... I want a vanilla-almond cake!!!"
Пример #35
def starts_with_vowel_sound(word, pronunciations=cmudict.dict()):
    """Tell whether the first pronunciation of *word* starts with a vowel.

    Returns True/False according to whether the first phoneme ends in a
    stress digit, or None when *word* has no pronunciations. The default
    dictionary is loaded once, when the function is defined.
    """
    for first in pronunciations.get(word, []):
        break  # use only the first one
    else:
        return None
    return first[0][-1].isdigit()
Пример #36
if not exists(NLTK_DATA_PATH):
    for datum in NLTK_DATA:

from nltk.stem.snowball import EnglishStemmer
import nltk.chunk as chunk
from nltk.corpus import cmudict

DIVIDER_TAG = ':'  # nltk uses this to tag for ; and :

# Set up some state that we'll use in the functions throughout this file:
# TODO consider making a class that has modular stemmer/tokenizer
stemmer = EnglishStemmer()
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
cmudict_dict = cmudict.dict()

# Some useful regexes:
vowel_re = re.compile("[aeiouAEIOU]")
vowel_phoneme_re = re.compile("AA|AE|AH|AO|AW|AY|EH|EY|ER|IH|IY|OW|OY|UH|UW")
consonant_phoneme_re = re.compile(

# Helper predicates:
is_vowel = partial(match, vowel_re)
is_vowel_phoneme = partial(match, vowel_phoneme_re)
is_consonant_phoneme = partial(match, consonant_phoneme_re)

def word_to_phonemes(word):
    result = cmudict_dict.get(word.lower(), None)
Пример #37
from nltk.corpus import cmudict

from HMM import unsupervised_HMM
from helper import *

# text = open(os.path.join(os.getcwd(), 'data/shakespeare.txt')).read()
text = open(os.path.join(os.getcwd(), 'data/allpoems.txt')).read()
# visualization of whole data set
wordcloud = text_to_wordcloud(text, title='Shakespeare')
# TODO: extract words
# - keep hyphenated words hyphenated
# - some words could be tokenized as bigrams
# - separate punctuation from words, and store them separately
obs, obs_map = parse_observations(text)
syllables = cmudict.dict()
for punct in [".", ",", ":", ";", "!", "?"]:

#Was 20
hmm8 = unsupervised_HMM(obs, 10, 100)

# visualizations of sparsity of A, O as well as
# visualizations of states as wordclouds
visualize_sparsities(hmm8, O_max_cols=50)
wordclouds = states_to_wordclouds(hmm8, obs_map)

#This part only works in Jupyter Notebook
anim = animate_emission(hmm8, obs_map, M=8)
Пример #38
def cut_tweet_to_syllables_unit_test():
    """Print the result of cut_tweet_to_syllables() for one sample tweet.

    The call passes 10 as the second argument (presumably the syllable
    budget -- confirm against cut_tweet_to_syllables) and a freshly loaded
    CMU dictionary.
    """
    dictionary = cmudict.dict()
    print cut_tweet_to_syllables(
        'damn n***a look at all these syllables tho for real', 10, dictionary)
Пример #39
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet as wn
from nltk.corpus import cmudict
from nltk.stem import WordNetLemmatizer
import csv

#initialization needed for using WordNetLemmatizer and cmudict
stemmer = nltk.PorterStemmer()
pronunciations = cmudict.dict()
lemma = WordNetLemmatizer()

#return true if the word has more than one meaning
def is_homo(word):
    """Return True when WordNet has more than one synset for *word*."""
    return len(wn.synsets(word)) > 1

#return true if the word has more than one pronunciation
def is_hetero(word):
    """Return True when *word* has more than one pronunciation in cmudict.

    Words without an entry yield False.
    """
    # One dict lookup instead of a `.keys()` membership test (a list scan on
    # Python 2) followed by a second lookup.
    return len(pronunciations.get(word, ())) > 1

#return true if given word is a verb with no meaning such as be, do, have
def is_general(word):
    if (word == "was" or word == "been" or word == "are" or word == "did"
Пример #40
class AmericanEnglishLangContext(LanguageBoundsInterface):
    """Defines the properties and implementation of standard American English."""

    ########## Variables ##########

    _cmu = cmudict.dict(
    )  # Pretrained phenome generation model. Created outside of methods because it is used over iteration(s) and is expensive to generate; TREAT THIS VALUE AS AN IMMUTABLE.
    _MULTI_TOKEN_INDICATOR = "_"  # Character used to identify when a token has multiple words. This functionality is specific to a corpus. Must be changed if corpus is changed.
    _NULL_PHENOME_INDICATOR = "*NONE*"  # Used by algorithm to indicate if a corressponding phemone could not be found for a token
    _SIMILARITY_THRESHOLD = 0.2  # The threshold that must be passed for a word to be considered similar. Scaled from 0-1.
    vowelphenomes = [
        "AA", "AE", "AH", "AO", "AW", "AY", "AX", "AXR", "EH", "ER", "EY",
        "IH", "IX", "IY", "OW", "OY", "UH", "UW", "UX"
    ]  # Contains all phenomes that produce vowel-related sounds for this language.


    def _getproperformattype(self, unformattoken):
        """Return the part of the token's name() that precedes the first '.'.

        Used on the synset objects returned by Wordnet to keep only the word.
        Raises ValueError when the name contains no '.'.
        """
        pieces = unformattoken.name().split(".", 1)
        name, _remainder = pieces
        return name

    def _getproperhandlemissingphenome(self, unknowntoken):
        """Takes a unknown-phenome (a token which could not be evaluated by CMUdict) and attempts to generate a phenome. If CMUdict or
        Wordnet implementation is changed this function MUST be changed."""

        finaleval = []

        # After various testing, it has been determined that calculating for two letters yields the most consistent results for unknown phenomes.
        tokenlen = len(unknowntoken)
        if tokenlen is 0:
        elif tokenlen is 1:
                              ])  # The letter IS the phenome
            relevant = unknowntoken[:2]  # get first two chars
            finalattempt = self._cmu.get(relevant, None)

            if finalattempt is None:  # No possible phenome can be generated by this algorithm
            elif finalattempt is list:
            else:  # 'finalattempt' is guareenteed to only be of type NONE, list, or list[list].
                )  # flatten list; tis step is necessary to maintain parsability

        return finaleval

    def _getproperhandlemultitoken(self, multitoken):
        """Takes a multi-word (a token with words seperated by '_' by Wordnet) and breaks it down into a format that can be evaluated by the CMUdict. If CMUdict or
        Wordnet implementation is changed this function MUST be changed."""

        finaleval = []
        individualtokens = multitoken.split(self._MULTI_TOKEN_INDICATOR)

        for token in individualtokens:  # evaluate each token phenome indiviually; then represent multitoken for EACH phenome calculated, when returned to scanning.
            phenome = self._cmu.get(token.lower(), None)

            if phenome is list:

            else:  # 'phenome' is guareenteed to only be of type NONE, list, or list[list].
                if phenome is None:
                    phenome = self._getproperhandlemissingphenome(token)

                )  # flatten list; this step is necessary to maintain parsability

        return finaleval

    def getphenomes(self, arg):
        """Returns all phenome-lists related to the token. ('context' is the representation of the phrase in collection form.)"""

        # uses CMUdict as the core processing algorithm. If CMUdict fails to find a match the function will predict a possible phenome for the token.
        # This function is guareenteed to return a value.

        generatephenome = self._cmu.get(
            arg.lower(), None
        )  # _cmu is defined globally above in "VARIABLES" section. Treat as an immutable.
        if generatephenome is None:
            if arg.__contains__(
            ):  # _MULTI_TOKEN_INDICATOR is defined globally above in "VARIABLES" section. Treat as an immutable.
                generatephenome = self._getproperhandlemultitoken(arg)

            else:  # token is unknown by CMUdict
                generatephenome = self._getproperhandlemissingphenome(arg)

        # When multiple phenomes exist for same word, a list[list[str]] is generated
        return generatephenome

    def hypernyms(self, context, arg):
        """Return the names of all hypernyms of the token, or None.

        'context' is the phrase in collection form; it is passed to lesk()
        together with the token. None is returned when lesk() finds no
        interpretation.
        """
        # This function assumes the use of Wordnet. If Wordnet implementation changes, this function MUST change.
        interpretation = lesk(context, arg)
        if interpretation is None:
            return None
        return map(self._getproperformattype, interpretation.hypernyms())

    def hyponyms(self, context, arg):
        """Return the names of all hyponyms of the token, or None.

        The token is disambiguated with lesk(context, arg); None is returned
        when lesk() finds no interpretation.
        """
        # This function assumes the use of Wordnet. If Wordnet implementation changes, this function MUST change.
        interpretation = lesk(context, arg)
        if interpretation is None:
            return None
        return map(self._getproperformattype, interpretation.hyponyms())

    def messagefail(self, input):
        """Produces the fail message to print to users in this language if the process cannot return a value."""
        built = " ".join(input)
        return (
            "Your input: '" + built +
            "' was not able to be parsed under the conditions you desired. Please try new conditions or try a new phrase."

    def messageonlyresult(self, arg):
        """Produces a indicator message if only one result was possible from the input parameters given."""
        return ("This is the only result processed from the given input:\n" +

    def messagetopresult(self, resultlen, requestedresultcount):
        """Produces the top 'x' results message to users in this language if the process has multiple results.

        The number shown is the smaller of the available result count and
        the requested count.
        """
        # Previously the second return was unreachable, so None was returned
        # whenever resultlen >= requestedresultcount.
        shown = min(resultlen, requestedresultcount)
        return ("Top " + str(shown) + " result(s):\n")

    def similarity(self, contextclues, arg1, arg2):
        """Returns a key-value pair for scoring similarity. [0] a bool that determines if the word is similar enough to satisfy language criteria
        and the score associated with the evaluation.

        Identical tokens are accepted with the threshold itself as score.
        Otherwise both tokens are disambiguated with lesk() and compared with
        path_similarity(); the score stays 0 when either step yields nothing.
        """

        # This function assumes the use of Wordnet. If Wordnet implementation changes, this function MUST change.

        # Compare by value: `is` only matches strings that happen to be the
        # same object.
        if arg1 == arg2:
            # Penalizing score to prevent paraphrases from returning themselves
            return (True, self._SIMILARITY_THRESHOLD)

        evaluation = False
        score = 0

        contextA = lesk(contextclues, arg1)
        contextB = lesk(contextclues, arg2)

        if contextA and contextB:  # Otherwise score will stay zero
            measured = contextA.path_similarity(contextB)
            # path_similarity() may return None; keep the numeric default then.
            if measured is not None:
                score = measured
                evaluation = self._SIMILARITY_THRESHOLD <= score

        return (evaluation, score)

    def split(self, arg):
        """Returns all non-whitespace tokens of `arg`.

        Tokens are runs of word characters, dollar amounts such as "$3.50",
        or any other run of non-whitespace characters.
        """
        # Raw string: same pattern, without invalid escape sequences in a
        # normal string literal.
        return RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(arg)
Пример #41
 def __init__(self):
     """Load the CMU pronouncing dictionary into self.cmudict."""
     self.cmudict = cmudict.dict()
Пример #42
def do_sentences_rhyme_unit_test():
    """Print the result of ``do_sentences_rhyme`` for a few sample pairs."""
    dictionary = cmudict.dict()
    sample_pairs = [
        ('oh hello', 'no yellow'),
        ('so the dog', 'log'),
        ('potato', 'wefo'),
        ('hog', 'log'),
    ]
    for first, second in sample_pairs:
        # A single-argument print(...) gives the same output under Python 2
        # and Python 3, unlike the bare print statement used before.
        print(do_sentences_rhyme(first, second, dictionary))
        for i, tag in enumerate(tag_list):
            spelling, sense, pron = tag
            info = '(' + str(sense) + ',' + pron + ')'
            loc = sent.find(spelling, current_loc)
            sent = sent[:loc + len(spelling)] + info + sent[loc +
            current_loc = loc + len(spelling) + len(info)
        format_output['sentence'][row_idx] = sent
        print(sent, '--- Source: ', row['citation'])

    format_output.to_csv(filename, index=False, header=0)

## Set up basic corpora
# Lookup table returned by cmudict.dict().
pron_dict = cmudict.dict()
# Tagged words from three corpora, each requested with the 'universal' tagset.
brown_words = brown.tagged_words(tagset='universal')
treebank_words = treebank.tagged_words(tagset='universal')
nps_words = nps_chat.tagged_words(tagset='universal')
# Concatenate the three sources, then lower-case every word while keeping
# its tag.
corpus = brown_words + treebank_words + nps_words
corpus = [(word.lower(), tag) for (word, tag) in corpus]
# English stopwords, held as a set.
stopset = set(stopwords.words('english'))
## Set up pretrained spaCy's word vector
nlp = spacy.load('en_core_web_lg')

## Collect potential heteronyms
data = get_het_from_corpus(corpus)

## Assign Wiktionary data to the potential heteronyms
# `data` is rebound to whatever get_pronunciation returns.
parser = init_wikparser()
data = get_pronunciation(parser, data)
Пример #44
Classes and utilities for extracting haiku from arbitrary text and evaluating them based on some programmatically
defined criteria
import nltk
import string
from nltk.corpus import cmudict
from nltk_util import syllables_en
from haikus.evaluators import DEFAULT_HAIKU_EVALUATORS

# WORD_DICT is a plain module-level name; a ``global`` statement has no
# effect at module scope, so none is used.
try:
    WORD_DICT = cmudict.dict()
except LookupError:
    # NLTK raises LookupError when a corpus is not installed. Repeating the
    # same call unchanged could never succeed, so fetch the corpus first.
    nltk.download('cmudict')
    WORD_DICT = cmudict.dict()

class NonwordError(Exception):
    """Exception type, presumably for input that is not a word.

    NOTE(review): the raise sites are not visible in this section; confirm
    the exact condition against them.
    """

class HaikuText(object):
    """
    A wrapper around some sequence of text
    """
    def __init__(self, text=None):
        """Store ``text`` (``None`` when omitted) as the wrapped text."""
        self._text = text

    def get_text(self):
        """Return the wrapped text exactly as it was given."""
        return self._text
Пример #45
def unit_test_count_syllables_sentence():
    """Print the syllable count of two sample sentences."""
    dictionary = cmudict.dict()
    sample_sentences = (
        'hello please check my syllables',
        'checking some syllables right now dog',
    )
    for sentence in sample_sentences:
        # Both calls were cut off after the sentence argument; ``dictionary``
        # is the only other value prepared here, so it is passed as the
        # second argument.
        print(count_syllables_sentence(sentence, dictionary))
Пример #46
from __future__ import print_function
from __future__ import division
from scipy.integrate import quad
import random
import numpy as np
import codecs
import string
import re
import cPickle as cp
from nltk import pos_tag
from nltk import word_tokenize
import collections

from nltk.corpus import cmudict

d = cmudict.dict()  # dicionary of syllables from cmudict

def check_unique(unique):
    """Write every item of ``unique`` to ``unique.txt``, one per line.

    The file is created in the current working directory and overwritten
    if it already exists.

    Args:
        unique: Iterable of strings to write.
    """
    # ``with`` guarantees the file is flushed and closed; the previous
    # version never closed the handle it opened.
    with open('unique.txt', 'w') as out:
        out.writelines(i + '\n' for i in unique)

# parses the text file 'shakespeare.txt' and adds each unique word to a dictionary,
# WORD_DIC, with a unique index
def parse(word_dic, index_dic):

    # open 'shakespeare.txt'
    'date_of_publication', 'num_of_words', 'num_of_non_empty_lines',
    'num_of_verses', 'avg_word_len', 'avg_line_len', 'avg_lines_per_verse',
    'longest_line', 'words_per_line', 'largest_word',
    'poem_stress_list_no_punct', 'chars_per_line'
# Load JSON

with open(DATA_DIR + READ_JSON_FILE, 'r') as infh:

    cnt = 0
    no_lines = 0

    largest_word_corpus_ls = []
    prondict = cmudict.dict()

    # for every poem-file-object
    for data in import_utilities.json_parse(infh):
        # process object
        cnt = cnt + 1
        #print "cnt:", cnt
        labels_ls = []

        author = 'UNKNOWN'
        title = 'UNKNOWN'

        # get the data out of json
        for idx, val in enumerate(data):

            #print idx, val
Пример #48
from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer
import os.path, time
import datetime
import common
import rssNewsFetcher
import pickle

d = cmudict.dict()  # get the CMU Pronouncing Dict
# Phrase tokens: runs of word characters, spaces, hyphens, apostrophes,
# curly single quotes and "$".
# NOTE(review): inside [...] the "|" is a literal character, not alternation,
# so "|" is accepted as part of a phrase too -- confirm that is intended.
phrasetokenizer = RegexpTokenizer(r"[\w| |\-|\'|\‘|\’|\$]+")
# Word tokens: runs of word characters and apostrophes.
# NOTE(review): this class also admits literal "+" and "|" -- confirm.
wordtokenizer = RegexpTokenizer(r"[\w+|\']+")
# Sound tokens: runs of capital ASCII letters.
soundtokenizer = RegexpTokenizer(r"[A-Z]+")

def hasNumbers(inputString):
    """Report whether ``inputString`` contains at least one digit character."""
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False

def nsyl(word):
    """return the max syllable count in the case of multiple pronunciations"""
    lastsound = ''
    syllables = 0
        if isinstance(d[word.lower()], list):
            word = d[word.lower()][0]
        for sound in word:
            if hasNumbers(sound):
                syllables = syllables + 1
                lastsound = ''
            #append last sound
            lastsound += soundtokenizer.tokenize(sound)[0]
Пример #49
kyle_tokens = kyle_quotes_lower.apply(nltk.word_tokenize)

# Flatten the per-quote token lists into one list of tokens.
# (The closing brackets of both comprehensions were missing.)
kyle_tokens_list = [
    word for inner_list in list(kyle_tokens) for word in inner_list
]
# Replace a trailing run of characters outside the listed class, or a
# trailing apostrophe, with the marker 'punc'.
kyle_tokens_list = [
    re.sub(r'[^A-Za-z0-9\'\-{1}]+$|\'$', 'punc', i) for i in kyle_tokens_list
]
# Ratio of distinct tokens to all tokens.
kyle_lexical_diversity = len(set(kyle_tokens_list)) / len(kyle_tokens_list)

# Index entries whose `Line` count exceeds 1000.
top_characters = quotes_by_character.count()[
    quotes_by_character.count().Line > 1000].index
pro_dict = cmudict.dict()

def get_character_params(data, character):

    character_quotes = data[data.Character == character].Line
    character_quotes_lower = character_quotes.apply(str.lower).apply(
        str.rstrip, '\n')
    character_tokens = character_quotes_lower.apply(nltk.word_tokenize)
    character_tokens_list = [
        word for inner_list in list(character_tokens) for word in inner_list
    character_tokens_list = [
        re.sub(r'[^A-Za-z0-9\'\-{1}]+$|\'$', 'punc', i)
        for i in character_tokens_list
Пример #50
parser = argparse.ArgumentParser()
                    help='provide a word to find its rhymes')
                    help='provide a word/sentence to see its phonemes')
args = parser.parse_args()

l_ents = cmudict.entries()  # "list" of entries
d_ents = cmudict.dict()  # "dict" of entries

# if they are using command line args, single usage mode
if len(argv) > 1:

    if args.rhyme:

    if args.phones:

# interactive mode with repeating menu and options
    inp = ''
    while inp != 'q':
        print('\n(1) Find phonemes\n(2) Find rhyming words\n(q) Quit\n')
Пример #51
import util


MODULE_NAME = 'termset_expander.py'

# Debug flag, off by default. No ``global`` statement is needed: at module
# scope an assignment already creates a module-level name.
DEBUG = False

# load Spacy's English model
nlp = spacy.load('en_core_web_sm')

# initialize the CMU phoneme dictionary
cmu_dict = cmudict.dict()

# regexes for locating termsets in NLPQL files

# line comment - match everything from // up to but NOT including the newline
# also, don't match the // in a URL
# The two negative lookbehinds reject "//" preceded by "http:" or "https:".
# NOTE(review): the (?=\n) lookahead requires a newline after the comment, so
# a comment on a final line with no trailing newline is not matched.
str_line_comment = r'(?<!http:)(?<!https:)//.*(?=\n)'
regex_line_comment = re.compile(str_line_comment, re.IGNORECASE)

# multiline comment
# Non-greedy so each /* ... */ pair is matched separately; DOTALL lets "."
# cross line breaks.
str_multiline_comment = r'/\*.*?\*/'
regex_multiline_comment = re.compile(str_multiline_comment,
                                     re.IGNORECASE | re.DOTALL)

# a term is anything enclosed in double quotes
# At least one character is required between the quotes, so "" is not a term.
str_term = r'\"[^"]+\"'
Пример #52
import Models
from nltk.corpus import cmudict
""" Global variables for the reading level project.

This module holds the global variables for the project.

    Charles Billingsley
    Josh Getter
    Adam Stewart
    Josh Techentin
"""

# Main Globals
# Module-level state; every value below starts empty, zero or False.
dictionary = cmudict.dict()
input_file = ''
file_content = ''
current_line_number = 0
full_input = ''
total_words = 0
total_sentences = 0
total_syllables = 0
target_reading_level = ''
shouldModify = False

# ChangeLevel Globals
# NOTE(review): target_reading_level is already assigned the same value in
# the "Main Globals" group above; this second assignment is redundant.
target_reading_level = ''
target_reading_score = Models.ReadingScoreRange()
Пример #53
def __syllables__(word):
    """Return the syllable count of the first CMUdict pronunciation of ``word``.

    One syllable is counted for every phoneme whose last character is a
    digit.

    Args:
        word: Word to look up, exactly as it is keyed in the dictionary.

    Returns:
        The syllable count, or 0 for the empty string.

    Raises:
        KeyError: If ``word`` is not in the dictionary.
    """
    # Formatting into one string keeps the output identical whether print is
    # a statement (Python 2) or a function (Python 3).
    print("Doing syllables lookup for %s" % word)
    if word == '':
        # Answer before paying for the dictionary load.
        return 0
    d = cmudict.dict()
    # Only the first pronunciation is used, so only that one is counted.
    first_pronunciation = d[word][0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
Пример #54
import re
import inflect
import hyphenator
from nltk.corpus import cmudict

d = cmudict.dict()  #probably will need this
p = inflect.engine()

def hyphenate_word(word):
    """Return ``word`` with its syllables joined by single hyphens.

    The pieces produced by ``hyphenator.hyphenate_word`` are joined with
    "-"; any doubled hyphen that results (for example when the input was
    already hyphenated) is collapsed back to a single one.

    NOTE(review): an earlier comment said the result is "also capitalized",
    but nothing here changes letter case.
    """
    pieces = hyphenator.hyphenate_word(word)
    hyphenated = '-'.join(pieces)
    return hyphenated.replace('--', '-')

def hyphenate_phrase(phrase):
    """Hyphenate every space-separated word of ``phrase``.

    The hyphenated phrase is printed, as before, and also returned so that
    callers can use it.

    Args:
        phrase: Words separated by single spaces.

    Returns:
        The phrase with each word passed through ``hyphenate_word``.
    """
    # The loop body was missing, so the result list stayed empty and an
    # empty line was printed for every input.
    hyphenated = " ".join(hyphenate_word(word) for word in phrase.split(" "))
    print(hyphenated)
    return hyphenated

def word_syllable_count(word):
    """Count syllables as the number of hyphen-separated parts of ``word``."""
    hyphen_total = hyphenate_word(word).count('-')
    return 1 + hyphen_total
Пример #55
#-*- coding: utf-8 -*-

# Tools for working with poems
# Licensed under GPLv2 or later.

from __future__ import print_function
import json, os, re, sys
from collections import defaultdict
from string import ascii_lowercase
from Levenshtein import distance
from .countsyl import count_syllables

    from nltk.corpus import cmudict
    cmu = cmudict.dict()
    with open(os.path.join(os.path.dirname(__file__), 'cmudict/cmudict.json')) as json_file:
        cmu = json.load(json_file)

def elided_d(word):
    """Expand a trailing ``'d`` to ``ed``; return other words unchanged."""
    elision = "'d"
    if not word.endswith(elision):
        return word
    stem = word[:len(word) - len(elision)]
    return stem + "ed"

def tokenize(poem):
    tokens = []
    for line in poem.split('\n'):
        line       = line.replace('-', ' ') # need to find a better tokenizer, but this works for now
        no_hyphens = line.replace('—', ' ') 
        cleaned    = re.sub(r'[^0-9a-zA-Z\s\']', '', no_hyphens) # keep apostrophes
Пример #56
from collections import Counter
from nltk.corpus import words             #check dictionary
from nltk import pos_tag as posTag
import emoji #pip install
import re                           #elongation
from autocorrect import spell    #pip install     #check spelling
from nltk.tokenize import sent_tokenize #sentence tokenizer         https://www.nltk.org/api/nltk.tokenize.html also see
import csv                              #read file    
from datetime import datetime           #convert unix time to human time
from nltk.tokenize import RegexpTokenizer       #remove puncutations
from nltk import edit_distance as ed    #check word spelling correction distance
import urllib.request as urllib         #ud convert url to unicode
punctuations = RegexpTokenizer(r'\w+')
from nltk.corpus import cmudict
import math
CMUdict = cmudict.dict()      #syllable

class preProcess(object):
    def __init__(self):
        loads urban dictionary and emoji list
    def chanCleaner(self,post):          #clean 4archive posts
        cleans 4chan posts by removing the initial disclaimer
Пример #57
import nltk

from nltk.corpus import cmudict
import numpy as np

d = cmudict.dict()

def syllable_count(word):
    """Return the smallest syllable count among the pronunciations of ``word``.

    The word is looked up lower-cased in ``d``. For each pronunciation, one
    syllable is counted per phoneme whose last character is a digit. Words
    missing from ``d`` fall back to ``_syllables``.

    Args:
        word: Word to count syllables for.

    Returns:
        The minimum count over all pronunciations, or the ``_syllables``
        estimate when the word is not in the dictionary.
    """
    # Only the lookup can raise KeyError, so only the lookup is guarded.
    try:
        pronunciations = d[word.lower()]
    except KeyError:
        #if word not found in cmudict
        return _syllables(word)
    return np.min([
        sum(1 for phoneme in pron if phoneme[-1].isdigit())
        for pron in pronunciations
    ])

def _syllables(word):
    #referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    count = 0
    vowels = 'aeiouy'
    word = word.lower()
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
Пример #58
from nltk.corpus import cmudict
from pattern.en import parse, parsetree, wordnet, NOUN, pluralize
from BasicModels import Error
import os
import settings
import logging

LOGGER = logging.getLogger("pattern.server")

PRON = cmudict.dict()
AEIOU = ['A', 'E', 'I', 'O', 'U']

#countabl features from celex
def readNounList(fileName):
    nounList = open(fileName, "r")
    raw = nounList.read().splitlines()
    maps = dict()
    for line in raw:
        data = line.strip().split("\t")
        key = data[0]
        cop = data[1]
        if len(data) != 14:
            print "Read list wrong!"
        if maps.has_key(key):
            tmp = maps.get(key)
            if cop > tmp:
                maps[key] = data[1:]
import re

import numpy as np
import pandas as pd
# cmudict's entries() method is used to find all phonemes
import nltk
from nltk.corpus import stopwords  # used as stopwords.words("english")
from nltk.corpus import cmudict
# import the two scikit-learn classes needed to compute tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# In[618]:

# use the hash-map (dict) version of cmudict; lookups are faster
phonetic_check_dict = cmudict.dict()

# In[619]:

# cmudict.entries() is a list; each element is a tuple where tuple[0] is the word or letter and tuple[1] is its phonemes

# In[620]:

def get_data(filename):
    """Read the UTF-8 encoded CSV file ``filename`` into a pandas DataFrame."""
    return pd.read_csv(filename, encoding='utf-8')

# In[621]:
class SG:
    normalized_words = []
    pronunciation_tokens = []
    post_prosody = []
    cmud = cmudict.dict()
    sound_dict = sound_dict_generator.Synth().diphones

    #def __init__(self):
        #self.normalized_words = ['<beginning>', '<question>', 'hello', 'there', 'professor', '<break,comma,1>', 'how', 'are',
        #                         'you', 'doing', '<break,question,2>', 'i', 'am', 'good', '<break,sent_end,2>',
        #                         '<exclamation>', 'This', 'is', 'so', 'amazing', '<break,exclamation,2>', '<end>']
        # self.normalized_words = ['doctor', 'rabbits', 'email', 'is', 'i', 'l', 'u', 'v', 'c', 'a', 'r' 'r',
        # 'o' 't' 's', 'three', 'zero', 'five', 'at', 'g', 'mail', 'dot', 'c', 'o', 'm', '<break,sent_end,2>', 'you',
        # 'can', 'checkout', 'his', 'website', '<break,comma,1>', 'r', 'a', 'b', 'b', 'i', 't', 'd', 'r', 'dot', 'g',
        # 'o', 'v', '<break,sent_end,2>', 'he', 'uses', 'forty', 'milliliters', 'beakers', 'to', 'find', 'tilde',
        # 'volume', '<break,sent_end,2>', 'he', 'has', '<currency>', 'negative', 'three', 'dollars', 'in', 'his',
        # 'bank', 'account', '<break,sent_end,2>']

    def __init__(self, n_w: list):
        """Create a generator for the given normalized words.

        Args:
            n_w: The normalized word tokens to process.
        """
        self.normalized_words = n_w
        # Give each instance its own result lists. The class-level defaults
        # are single list objects shared by all instances, and the methods
        # append to them through ``self``, so results from one instance
        # would otherwise accumulate in every other instance.
        self.pronunciation_tokens = []
        self.post_prosody = []

    def text_to_phoneme(self):
        skip = 0
        for w in self.normalized_words:  # get the token from normalized_words
            if w in self.cmud:
                phone = self.cmud[w][0]  # convert tokens to its phoneme form
                for i in range(len(phone)):
                    phone[i] = re.sub("[^a-zA-Z\\s\-]", "", phone[i]).lower()
                self.pronunciation_tokens.append(phone)  # add the phoneme form of the word to pronunciation_tokens
            elif w[0] == '<' and w[-1] == '>':
                for i in range(len(w)):
                    if skip > 0:
                        skip -= 1
                        phone = self.cmud[w[i:i+5].lower()][0]
                        skip = 4
                            phone = self.cmud[w[i:i+4].lower()][0]
                            skip = 3
                                phone = self.cmud[w[i:i+3].lower()][0]
                                skip = 2
                                    phone = self.cmud[w[i:i+2].lower()][0]
                                    skip = 1
                                        phone = self.cmud[w[i].lower()][0]
                    for i in range(len(phone)):
                        phone[i] = re.sub("[^a-zA-Z\\s\-]", "", phone[i]).lower()
                # TODO: figure out what to do with words not in the cmu dictonary
                #       Possibilities: should we get the root?, use the google converter?

    def prosody_analyzer(self):
        temp = []
        for w in self.pronunciation_tokens:
            if w[0] == "<beginning>" or w[0] == "<end>":
                if w[0] == "<end>":
            elif w[0] == "<break,comma,1>":
            elif w[0] == "<break,semicolon,1.5>" or w[0] == "<break,colon,1.5>":
            elif w[0] == "<break,sent_end,2>" or w[0] == "<break,question,2>" or w[0] == "<break,exclamation,2>":
            elif w[0] == "<question>" or w[0] == "<exclamation>":
            elif w[0] == "<space>":
                for p in w:
        for i in range(len(temp)):
            if temp[i] == "<exclamation>" or temp[i] == "<question>" or temp[i] == "<break,sent_end,2>" or temp[i] == "<break,question,2>" or temp[i] == "<break,exclamation,2>":
            if i != len(temp)-1:
                if temp[i+1] == "<exclamation>" or temp[i+1] == "<question>" or temp[i+1] == "<break,sent_end,2>" or temp[i+1] == "<break,question,2>" or temp[i+1] == "<break,exclamation,2>" or temp[i+1] == "<end>":
                    if temp[i+1] == "<end>":
                        self.post_prosody.append(temp[i] + '-' + temp[i + 2])
                    self.post_prosody.append(temp[i] + '-' + temp[i+1])