def __init__(self, dataset="picasso2", basedir="parsed_data"):
    self.dataset = dataset
    self.basedir = basedir
    filename = "%s/%s/source/%s" % (basedir, dataset, dataset)
    self.debug("poemparser:init:dataset parsing '%s'..." % filename)
    with open("pyParser/english_words.txt") as word_file:
        self.english_words = set(word.strip().lower() for word in word_file)

    # Open and analyze the text data.
    self.unknownWords = {}
    self.iffyWords = {}
    self.allmatch = {}
    self.alltokens = self.openTokens(filename)
    self.parsedTokens = [token for token in self.alltokens[0] if token != '-']
    self.replacedTokens = [token for token in self.alltokens[1] if token != '-']
    self.fullTokens = [token for token in self.alltokens[2] if token != '-']
    self.tokens = self.parsedTokens
    self.loweredTokens = [token.lower() for token in self.replacedTokens]
    self.pos_tags = nltk.pos_tag(self.replacedTokens)
    self.text = nltk.Text(self.tokens)
    self.dict = cmudict.dict()
    self.lastspeed = 0
    self.midiindex = 0
    self.setMIDISettings(12)
    self.debug("poemparser:init:words %s" % self.fullTokens)
    self.debug("poemparser:init:tokens %s" % self.tokens)
    self.debug("poemparser:init:text %s" % self.text)
def reset_country_codes_to_emoflags(cc_path='country_codes.txt',
                                    irange=ET.FLAGS_RANGE,
                                    charset='utf-8'):
    '''Using a country code dict, set the name and syllable fields
    in a copy of emo_tuples.'''
    cmu_prons = cmudict.dict()  # get the CMU Pronouncing Dict
    cc_dict = load_country_codes(cc_path)
    for tup in ET.EMO_TUPLES[irange.start:irange.stop]:
        cc2 = tup[ET.INDEX_ALTERNATIVES][0].strip(':').upper()
        monos, polys, names = [], [], [cc2]
        names.extend(nm for nm in tup[ET.INDEX_POLYSYLLABLES] if len(nm) > 2)
        try:
            names.extend(cc_dict[cc2])
        except KeyError:
            print("{} missing {}\n\tusing: {}".format(cc2, tup, names),
                  file=sys.stderr)
        for name in set(names):
            if sylc.syl_count(cmu_prons, name) == 1:
                monos.append(name)
            else:
                polys.append(name)
        tupal = list(tup)
        tupal[ET.INDEX_WORDSYLLABLES] = monos
        tupal[ET.INDEX_POLYSYLLABLES] = polys
        ret = tuple(tupal)
        print("    {},".format(ret), file=sys.stdout)
    print()
def stress(self, bysentence=False):
    """Annotate the words in self.text with the stress pattern of each word,
    as given by the CMU Pronouncing Dictionary."""
    vowels = ['A', 'E', 'I', 'O', 'U']
    possible_stresses = ['1', '2', '0']
    totaldic = cmudict.dict()

    def gen_stress(stripped_text):
        stress_list = []
        for word in stripped_text.lower().split():
            try:
                stress = str()
                phonemized = totaldic[word][0]
                for phoneme in phonemized:
                    for stresser in possible_stresses:
                        if stresser in phoneme:
                            stress += stresser
                # Inspect the last two phonemes for a vowel sound.
                for index, sound in enumerate(phonemized[-2:]):
                    for vowel in vowels:
                        if vowel in sound:
                            stress_list.append([word, stress, [index, sound],
                                                phonemized, len(phonemized)])
            except KeyError:
                # Word not found in the CMU dictionary; skip it.
                pass
        return stress_list

    if bysentence:
        # The original referenced an undefined `master_str`; self.text is
        # the only text in scope, so it is used here.
        sentences = PunktSentenceTokenizer().tokenize(self.text)
        stress_by_sentence = [
            sentence.translate(str.maketrans("", "", string.punctuation))
            for sentence in sentences
        ]
        return [gen_stress(sentence) for sentence in stress_by_sentence]
    stress_total = self.text.translate(str.maketrans("", "", string.punctuation))
    return gen_stress(stress_total)
def __compliant_haiku(self, haiku_source):
    """Ensure that newlines remain and all other punctuation has been stripped."""
    pron_dict = cmudict.dict()  # renamed from `dict` to avoid shadowing the builtin
    haiku_lines = haiku_source.splitlines()
    syllables = []
    for line in haiku_lines:
        if line == "":
            continue
        sal = []
        for word in line.split(" "):
            # Count the stress-marked (vowel) phonemes in the first pronunciation.
            sal.append(len([x for x in pron_dict[word][0] if x[-1].isdigit()]))
        syllables.append(sum(sal))
    pattern = [5, 7, 5]
    if len(syllables) % 3 != 0:
        return False
    while len(syllables) > 0:
        if syllables[:3] != pattern:
            return False
        del syllables[:3]
    return True
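# Illustrative note (added; not from the original): the per-word count relies
# on cmudict stress digits marking vowel phonemes. For "poem", the first
# pronunciation ['P', 'OW1', 'AH0', 'M'] has two digit-terminated phonemes,
# so it counts as 2 syllables.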
def __init__(self, text):
    # Initialize vars
    self.sent_count = 0
    self.word_count = 0
    self.syll_count = 0
    self.cmu = cmudict.dict()
    self.processText(text)
def approx_nsyl(word):
    """Credit - Jason Sundram, http://runningwithdata.com/post/3576752158/w
    Return the max syllable count in the case of multiple pronunciations."""
    d = cmudict.dict()
    word = word.lower()  # the dict is keyed on lowercase, so check the form we look up
    if word not in d:
        return 0
    return max(len([y for y in x if y[-1].isdigit()]) for x in d[word])
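# Hedged usage sketch (added; not part of the original source). cmudict lists
# "fire" with one- and two-syllable pronunciations, so the max is taken:
if __name__ == "__main__":
    print(approx_nsyl("fire"))    # expected: 2
    print(approx_nsyl("xyzzyq"))  # expected: 0 (not in cmudict)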
def group_rhyming_tweets(filtered_tweet_list):
    """Groups rhyming tweets into lists, then returns a list containing those
    lists (optionally sorted so the largest rhyme group comes first; the sort
    is currently commented out)."""
    copy_filtered_tweet_list = list(filtered_tweet_list)
    dictionary = cmudict.dict()
    grouped_rhyming_tweets = []
    index = 0
    # The last element never needs checking: every pair has been compared by then.
    while index < len(copy_filtered_tweet_list) - 1:
        rhyme_list = [copy_filtered_tweet_list[index]]
        i = index + 1
        while i < len(copy_filtered_tweet_list):
            if (do_sentences_rhyme(copy_filtered_tweet_list[index],
                                   copy_filtered_tweet_list[i], dictionary)
                    or sentence_rhyme_score(copy_filtered_tweet_list[index],
                                            copy_filtered_tweet_list[i]) > 4):
                rhyme_list.append(copy_filtered_tweet_list[i])
                copy_filtered_tweet_list.pop(i)
                i = i - 1  # compensate for the pop so no element is skipped
            i = i + 1
        rhyme_list = list(set(rhyme_list))  # drop duplicate entries
        grouped_rhyming_tweets.append(rhyme_list)
        index = index + 1
    # grouped_rhyming_tweets = sorted(grouped_rhyming_tweets, key=len, reverse=True)
    grouped_rhyming_tweets = [i for i in grouped_rhyming_tweets if len(i) > 1]
    return grouped_rhyming_tweets
def compile_meter_list(self, new_words, verbose=True):
    """Simplify and compile CMU corpus info into a nested list."""
    iambic = cmudict.dict()  # connect to the CMU corpus
    big_list = []  # collects every word with all its versions and their meters
    for word in new_words:
        syl_num = sylco([word])
        word_n_versions_list = [word]  # the word plus its different versions
        versions_list = []  # all pronunciation versions for this word
        try:
            for n, x in enumerate(iambic[word.lower()]):
                version = [word + str(n)]  # label each version, e.g. "word0"
                meter_list = []  # the stress digits for this version
                for y in x:  # for each phoneme in the pronunciation
                    for char in y:
                        if char.isdigit():
                            meter_list.append(int(char))
                version.append(meter_list)
                versions_list.append(version)
            word_n_versions_list.append(versions_list)
            big_list.append(word_n_versions_list)
        except KeyError:  # the word isn't in the corpus
            version = [word + str(0)]
            meter_list = []
            if len(syl_num) == 1:
                for syl in range(syl_num[0]):
                    meter_list.append(-1)  # -1 marks an unknown stress
            version.append(meter_list)
            versions_list.append(version)
            word_n_versions_list.append(versions_list)
            big_list.append(word_n_versions_list)
    return big_list
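# Assumed output shape (added for illustration; pronunciation order and exact
# meters depend on the cmudict version). For new_words = ["record"], which
# cmudict lists with noun and verb stress patterns, the result looks
# something like:
#
#   [['record', [['record0', [1, 0]], ['record1', [0, 1]]]]]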
def on_load(self):
    print("Loading: " + self.__class__.__name__)
    wd = self.context.getWorkingDir()
    nltk.data.path.append(wd + "nltk_data")
    self.d = cmudict.dict()
def make_cmu_wordlist():
    """
    Strip the CMU Pronouncing Dictionary of stress digits.
    Prepend '#' and append '$' to each pronunciation (start/end markers for
    Markov chain use). Optionally pickle and dump to 'cmu.p'.
    """
    d = cmudict.dict()
    pronunciation_list = d.values()
    edited_list = []
    for entry in pronunciation_list:
        for word in entry:
            edited_word = ["#"]
            for phone in word:
                # Strip the trailing stress digits (0/1/2) from each phoneme.
                edited_word.append(phone.rstrip('0123456789'))
            edited_word.append('$')  # '$' marks the end of a word
            edited_list.append(edited_word)
    # with open('wordlists/cmu.p', 'wb') as outfile:
    #     pickle.dump(edited_list, outfile)
    return edited_list
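# Illustrative check (added; not from the original): each entry is a phoneme
# list bracketed by the '#' start marker and '$' end marker, with the stress
# digits stripped, e.g. one pronunciation of "cat" becomes:
#
#   ['#', 'K', 'AE', 'T', '$']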
def __init__(self):
    # generate n2w
    self.n2w = gen_n2w()
    # syllable dict
    self.cmu = cmudict.dict()
def parse_sentence(sent, syl=partial(syllabify, English),
                   pron_dict=cmudict.dict()):
    # Note: the default pron_dict is built once at import time and shared
    # across calls; this is deliberate caching.
    sent = sent.strip()
    if not len(sent):
        return
    tokens = list(filter(len, map(preprocess, sent.split())))
    phonemes = (map(syl, pron_dict[t]) for t in tokens)
    nsyllables = set()
    final_sounds = set()
    for words in product(*phonemes):
        if not len(words):
            return
        # Count the number of syllables and extract the stress pattern.
        stress, syllables = zip(*((s[0], s[1:]) for w in words for s in w))
        # Compute the final sound.
        final_syllable = syllables[-1]
        if len(final_syllable[2]):
            final_sound = "_".join(map("_".join, final_syllable[1:]))
        elif len(final_syllable[0]):
            final_sound = "{0}_{1}".format(final_syllable[0][-1],
                                           "_".join(final_syllable[1]))
        else:
            final_sound = "_".join(final_syllable[1])
        # Update the possible versions for this sentence.
        nsyllables.add(len(stress))
        final_sounds.add(final_sound + "_{0}".format(int(stress[-1] > 0)))
    return nsyllables, final_sounds, [tokens[-1]]
def fix_db():
    print("* Executing database FIX procedure...")
    # connect to db
    mongodb_url = os.getenv("OPENSHIFT_MONGODB_DB_URL")
    client = pym.MongoClient(mongodb_url)
    db = client["shalk"]
    coll = db["ngrams"]
    base_data_dir = os.getenv("OPENSHIFT_DATA_DIR")
    if not base_data_dir:
        base_data_dir = "../data/"
    # initialize cmu dict
    nltk.data.path = ["{0}nltk/".format(base_data_dir)]
    cdict = cmudict.dict()
    count = 0
    upcount = 0
    mod = 100
    # iterate over all docs that need fixing
    orlist = [
        {"syllables": {"$exists": False}},
        {"rand": {"$exists": False}},
        {"type": {"$exists": False}},
        {"rhyme": {"$exists": False}},
    ]
    ngrams = coll.find({"$or": orlist})
    total = ngrams.count()
    for ngram in ngrams:
        upngram = False
        lastword = get_last_word(ngram)
        if "syllables" not in ngram:
            upngram = True
            ngram["syllables"] = count_syllables(lastword, cdict)
        if "rand" not in ngram:
            upngram = True
            ngram["rand"] = random.random()
        if "rhyme" not in ngram:
            upngram = True
            ngram["rhyme"] = get_rhyme(lastword, cdict)
        if not upngram:
            count += 1
            continue
        update_ngram(ngram, db)
        upcount += 1
        count += 1
        if count % mod == 0:
            print("- {0} out of {1} analysed! Docs updated: {2}".format(
                count, total, upcount))
            sys.stdout.flush()
    print("* Database FIX procedure finished!")
def does_rhyme_unit_test():
    dictionary = cmudict.dict()
    print(does_rhyme('lol', 'bol', 2, dictionary))
    print(does_rhyme('cat', 'dog', 2, dictionary))
    print(does_rhyme('cat', 'bat', 2, dictionary))
    print(does_rhyme('cat', 'tot', 2, dictionary))
    print(does_rhyme('hello', 'yellow', 2, dictionary))
def load_pronunciations(pronun_dictionary_name='cmudict', stress='unstressed'):
    """Note that we only support cmudict from nltk."""
    if stress not in STRESS_OPTIONS:
        raise TypeError("stress must be one of {0}".format(STRESS_OPTIONS))
    try:
        cmu = cmudict.dict()
    except (LookupError, AttributeError):
        # `except LookupError, AttributeError` is Python 2 syntax that binds
        # the exception to the name AttributeError instead of catching both;
        # a tuple catches both exception types.
        cmu = load_cmu_pickle()
    return cmu
def __init__(self, wav_folder):
    self.phones = {}
    self.get_wavs(wav_folder)
    # Initialise pronunciation dictionary (always add entries for punctuation symbols)
    self.pron_dict = dict.fromkeys(['.', '?', '!'], 'double_sil')
    self.pron_dict[','] = 'sil'
    self.whole_dict = cmudict.dict()
    self.get_pron_dict(args.phrase)
def getMulti():
    cmu = cmudict.dict()
    rhymeToPros, pronunciationToWords = getDictionariesNeededForRhyming(cmu)
    print("rhymeToPros has " + str(len(rhymeToPros)) + " items")
    print("pronunciationToWords has " + str(len(pronunciationToWords)) + " items")
    rgs = [rhymeGroup(r, rhymeToPros, pronunciationToWords, syllabifier.syllabify)
           for r in rhymeToPros]
    multi = [r for r in rgs
             if (groupHasAtLeastOneDifference(r)
                 and not (r.HasOneWord() or r.HasOnePronunciation()))]
    print("English has " + str(len(multi)) + " good rhyme groups\n")
    return multi
def num_syllables(word):
    d = cmudict.dict()
    word = word.lower()  # cmudict is keyed on lowercase words
    if "-" in word:
        word2 = "".join(word.split("-"))
        if word2 in d:
            word = word2
        else:
            # Fall back to summing the syllables of each hyphen-separated part.
            return sum(num_syllables(w) for w in word.split("-"))
    # Count stress-marked (vowel) phonemes in the first listed pronunciation.
    return [len([y for y in x if y[-1].isdigit()]) for x in d[word]][0]
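# Hedged usage sketch (added; assumes neither "motherinlaw" nor the
# hyphenated form is a cmudict key, so the hyphen branch sums the parts):
if __name__ == "__main__":
    print(num_syllables("mother-in-law"))  # expected: 2 + 1 + 1 == 4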
def test_num_syllables():
    s = cmudict.dict()
    tests = ['animal', 'i', '0', 1]
    expected = [3, 1, 1, 1]
    results = []
    for t in tests:
        results.append(num_syllables(t, s))
    # Fraction of test cases that match the expected counts.
    rval = 1.0 * sum(e == r for e, r in zip(expected, results)) / len(expected)
    return rval
def set_up_globals(ono=True):
    global dictionary
    dictionary = cmudict.dict()
    global stressed
    stressed = "1"
    global unstressed
    unstressed = "0"
    if ono:
        setup_ono_type_map()
def main():
    '''Generates the tracery grammar for @my_cat_ebooks.'''
    logging.basicConfig(
        level='INFO',
        format='%(asctime)s %(levelname)8s [%(name)s] %(message)s',
    )
    log.info('Loading CMU pronunciation dictionary')
    global cmu_pronounciations
    cmu_pronounciations = cmudict.dict()

    log.info('fruits')
    fruits = load_corpus("foods/fruits.json")["fruits"]
    log.info('body parts')
    body_parts = load_corpus("humans/bodyParts.json")["bodyParts"]
    log.info('amazing')
    amazing = load_corpus("words/encouraging_words.json")["encouraging_words"]
    log.info('superstar')
    superstar = [ln for s in wordnet.synsets('superstar') for ln in s.lemma_names()]

    pronouns = [
        "[he:he][him:him][hes:he's]",
        "[he:she][him:her][hes:she's]",
        # TODO: reintroduce this, but it affects the conjugation of the occupation.
        #
        #     they may not be an cleaner
        #     they cleanses exultantly
        #
        # "[he:they][him:them][hes:they're]",
        "[he:it][him:it][hes:it's]",
    ]

    grammar = {
        "atrociously": adjly("atrocious"),
        "watermelon": fruits,
        "seven": "two three four five six seven eight nine ten eleven twelve".split(),
        "arm": body_parts,
        "amazing": amazing,
        "guitar": instruments(),
        "superstar": superstar,
        "setPronouns": pronouns,
        "setOccupation": occupations(),
        "stanza": [
            textwrap.dedent(s).strip()
            for s, weight in stanza_weights.items()
            for _ in range(weight)
        ],
        "origin": ["#[#setPronouns#][#setOccupation#]stanza#"],
    }

    log.info('writing grammar')
    with open('grammar.json', 'w') as f:
        json.dump(fp=f, indent=2, obj=grammar, sort_keys=True)
def recover_file_to_db(datafile):
    filename = datafile.rsplit("/")[-1]
    print("* Recovering file [{0}] into db...".format(filename))
    # connect to db
    mongodb_url = os.getenv("OPENSHIFT_MONGODB_DB_URL")
    client = pym.MongoClient(mongodb_url)
    db = client["shalk"]
    coll = db["ngrams"]
    base_data_dir = os.getenv("OPENSHIFT_DATA_DIR")
    if not base_data_dir:
        base_data_dir = "../data/"
    # initialize cmu dict
    nltk.data.path = ["{0}nltk/".format(base_data_dir)]
    cdict = cmudict.dict()
    count = 0
    mod = 1000
    # open the file in reverse, and import it until we find the point where we stopped
    ngrams = []
    for line in reversed(open(datafile).readlines()):
        ngram = get_ngram(line, cdict)
        if not ngram:
            continue
        # stop when we find this ngram in the db already
        if find_one(ngram, db):
            # if `force`, we iterate over all docs but ignore the ones already inserted
            if args.force:
                print("- ({0}) Ngram [{1}] already in the db, jumping to the next one...".format(filename, ngram))
                sys.stdout.flush()
                continue
            print("- ({0}) Ngram [{1}] already in the db, stopping the recovery!".format(filename, ngram))
            sys.stdout.flush()
            break
        ngrams.append(ngram)
        count += 1
        if count % mod == 0:
            print("- ({0}) Inserted [{1}] ngrams into db...".format(filename, len(ngrams) * (count / mod)))
            print("- ({0}) {1} -> {2}".format(filename, ngrams[0], ngrams[-1]))
            sys.stdout.flush()
            insert_ngrams(ngrams, db)
            ngrams = []
    print("- ({0}) Inserting last [{1}] ngrams into db...".format(filename, len(ngrams)))
    sys.stdout.flush()
    insert_ngrams(ngrams, db)
    print("* Finished importing file [{0}]!".format(filename))
def __init__(self, wav_folder):
    # Create a blank audio object for output, with a sample rate of 16000 Hz
    self.out = SA.Audio(rate=16000)
    # Add wavs as audio objects for each phoneme
    self.phones = self.get_wavs(wav_folder)
    # and additional elements for pause breaks
    self.add_phone_break('comma - break', 250)
    self.add_phone_break('sentence - break', 500)
    self.word_phones_dict = cmudict.dict()
def __init__(self):
    self.dict = cmudict.dict()
    # Map common nonstandard spellings back to their dictionary keys.
    self.unknown_dict = {}
    for key in self.dict.keys():
        if "'" in key:
            self.unknown_dict[key.replace("'", '')] = key
        if key.endswith('ing'):
            self.unknown_dict[key.replace('ing', 'in')] = key
        if 'every' in key:
            self.unknown_dict[key.replace('every', 'evry')] = key
def SyllableCalculator(text):
    d = cmudict.dict()
    counter = 0.0
    # Split all-caps runs, CamelCase boundaries, and hyphen/apostrophe words.
    tokens = re.findall(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+", text)
    for token in tokens:
        count = 1.0  # default for words missing from the dictionary
        if token.lower() in d:
            count = max(len([y for y in x if y[-1].isdigit()])
                        for x in d[token.lower()])
        counter = counter + count
    return counter
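# Hedged usage sketch (added; not part of the original source):
if __name__ == "__main__":
    print(SyllableCalculator("hello world"))  # expected: 3.0 (2 + 1)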
def __init__(self): """Instantiate a MinPairFinder. Note; To avoid unnecessarily repeating the work of loading the dict, call `get_instance` instead. """ if not self._dict: self._dict = cmudict.dict() if not self._rhymes_dict: self._rhymes_dict = self._get_rhymes_dict()
def text_to_phoneme_2(text):
    # different format for speech
    phoneme_dict = cmudict.dict()
    # The original iterated over an undefined `raw_english` and clobbered the
    # `text` parameter before reading it; iterating over the input's words
    # appears to be the intent.
    out = ""
    for word in text.lower().split():
        syllable = phoneme_dict[word][0]  # first pronunciation; others are ignored
        syllable = '-'.join(syllable)
        out = out + syllable + "- -"
    return "-" + out
def get_rhymes(self, word):
    rhymes = []
    word_pronounciations = cmudict.dict()[word]
    entries = cmudict.entries()  # hoisted so the list isn't rebuilt per pronunciation
    for word_pronounciation in word_pronounciations:
        for rhyme, rhyme_pronounciation in entries:
            # Two words "rhyme" here if their final phonemes match.
            if rhyme_pronounciation[-1] == word_pronounciation[-1]:
                rhymes.append(rhyme)
    return rhymes
def transcribeWord(word):
    pron_dict = cmudict.dict()  # renamed from `dict` to avoid shadowing the builtin
    if word in pron_dict:
        pronunciations = pron_dict[word]
        phones = pronunciations[0]  # the first pronunciation's phonemes
        return ' '.join(phones)
    return False
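# Hedged usage sketch (added; not part of the original source):
if __name__ == "__main__":
    print(transcribeWord("hello"))  # expected: 'HH AH0 L OW1' (first pronunciation)
    print(transcribeWord("qwzx"))   # expected: False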
    get_syllable_dict,
    get_track_str,
)
from haikuincidence.utils.haiku_utils import count_syllables, get_haiku
from haikuincidence.utils.text_utils import clean_text

# get data to use for dealing with tweets
track_str = get_track_str()
ignore_tweet_list = get_ignore_tweet_list()
syllable_dict = get_syllable_dict()
emoticons_list = get_emoticons_list()

# Use inflect to change digits to their English word equivalent
inflect_p = inflect.engine()
# Use the CMU dictionary to count syllables
pronounce_dict = cmudict.dict()

# guess_syl_method = "min"
guess_syl_method = "mean"
# guess_syl_method = "max"


def get_syllable_count_and_haiku(text):
    count = count_syllables(
        text,
        inflect_p,
        pronounce_dict,
        syllable_dict,
        emoticons_list,
        guess_syl_method,
    )
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
import numpy as np
import os.path
import pandas as pd
import pickle
import random
import re
import requests
import string
import sys

# dictionary to look up pronunciations
master_dict = cmudict.dict()


def save_obj(obj, fname):
    with open(fname, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)


class Poem(object):
    def __init__(self, text, fname=None):
def get_syllables(word):
    d = cmudict.dict()
    # Use the first pronunciation; the original indexed [1], which raises
    # IndexError for any word with a single pronunciation.
    return [len([y for y in x if y[-1].isdigit()]) for x in d[word.lower()]][0]
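# Hedged example (added): with the fix above, "water" uses its first
# pronunciation ['W', 'AO1', 'T', 'ER0'] and returns 2.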
brown.categories()                   # the categories of texts in the corpus
brown.fileids(categories="hobbies")  # text files in the category of "hobbies"
brown.raw()[:100]                    # the raw text of the brown corpus (note tags)
brown.sents()[0]                     # first sentence in the corpus
brown.words()[:10]                   # first ten words in the corpus
brown.tagged_sents()[0]              # first sentence, each word tagged with part-of-speech info
brown.tagged_words()[:50]            # first fifty words, all tagged

##-- Specialized corpus: cmudict --##
## The Carnegie Mellon University Pronouncing Dictionary
## over 130,000 words, includes stress and variant pronunciations
cmudict.dict()['idiosyncratic']
cmudict.dict()['caravan']

### ~~~~~~~~~~~~~~~~~~~~~~~ ###
### 2. Tokenizing Sentences ###
### ~~~~~~~~~~~~~~~~~~~~~~~ ###
###
### - Breaking a sentence string into tokens (words, etc.)
###
### - A few common issues:
###     Punctuation
###     Contractions (e.g., can't)
###     Non-alphabetical words

sent = "I don't want a blueberry cake... I want a vanilla-almond cake!!!"
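## For example (illustrative; assumes nltk is imported as elsewhere in this
## tutorial, and the exact tokens depend on the nltk version), word_tokenize
## splits contractions and separates punctuation:
nltk.word_tokenize(sent)
# ['I', 'do', "n't", 'want', 'a', 'blueberry', 'cake', '...',
#  'I', 'want', 'a', 'vanilla-almond', 'cake', '!', '!', '!']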
def starts_with_vowel_sound(word, pronunciations=cmudict.dict()):
    # The default `pronunciations` dict is built once and cached across calls.
    for syllables in pronunciations.get(word, []):
        # A phoneme ending in a digit carries a stress marker, i.e. it is a vowel.
        return syllables[0][-1].isdigit()  # use only the first pronunciation
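# A minimal sketch (added; not from the original) of the classic use case,
# choosing "a" vs "an" by sound rather than spelling:
if __name__ == "__main__":
    for w in ("hour", "union"):
        print("an" if starts_with_vowel_sound(w) else "a", w)
        # expected: "an hour", "a union" (the Y in "union" is a consonant sound)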
if not exists(NLTK_DATA_PATH):
    for datum in NLTK_DATA:
        nltk.download(datum)

from nltk.stem.snowball import EnglishStemmer
import nltk.chunk as chunk
from nltk.corpus import cmudict

DIVIDER_TAG = ':'  # nltk uses this to tag for ; and :

# Set up some state that we'll use in the functions throughout this file:
# TODO consider making a class that has modular stemmer/tokenizer
stemmer = EnglishStemmer()
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
cmudict_dict = cmudict.dict()

# Some useful regexes:
vowel_re = re.compile("[aeiouAEIOU]")
vowel_phoneme_re = re.compile("AA|AE|AH|AO|AW|AY|EH|EY|ER|IH|IY|OW|OY|UH|UW")
consonant_phoneme_re = re.compile(
    "^(?:B|D|G|JH|L|N|P|S|T|V|Y|ZH|CH|DH|F|HH|K|M|NG|R|SH|TH|W|Z)")

# Helper predicates:
is_vowel = partial(match, vowel_re)
is_vowel_phoneme = partial(match, vowel_phoneme_re)
is_consonant_phoneme = partial(match, consonant_phoneme_re)


def word_to_phonemes(word):
    result = cmudict_dict.get(word.lower(), None)
from nltk.corpus import cmudict

from HMM import unsupervised_HMM
from helper import *

# PREPROCESSING
# text = open(os.path.join(os.getcwd(), 'data/shakespeare.txt')).read()
text = open(os.path.join(os.getcwd(), 'data/allpoems.txt')).read()

# visualization of the whole data set
wordcloud = text_to_wordcloud(text, title='Shakespeare')

# TODO: extract words
# - keep hyphenated words hyphenated
# - some words could be tokenized as bigrams
# - separate punctuation from words, and store them separately
obs, obs_map = parse_observations(text)

syllables = cmudict.dict()
for punct in [".", ",", ":", ";", "!", "?"]:
    syllables.update({punct: [[]]})

# UNSUPERVISED LEARNING
# was 20 hidden states
hmm8 = unsupervised_HMM(obs, 10, 100)

# visualizations of the sparsity of A and O, as well as
# visualizations of states as wordclouds
visualize_sparsities(hmm8, O_max_cols=50)
wordclouds = states_to_wordclouds(hmm8, obs_map)

# This part only works in a Jupyter Notebook
anim = animate_emission(hmm8, obs_map, M=8)
HTML(anim.to_html5_video())
def cut_tweet_to_syllables_unit_test():
    dictionary = cmudict.dict()
    print(cut_tweet_to_syllables(
        'damn n***a look at all these syllables tho for real', 10, dictionary))
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet as wn
from nltk.corpus import cmudict
from nltk.stem import WordNetLemmatizer
import csv

# initialization needed for using WordNetLemmatizer and cmudict
stemmer = nltk.PorterStemmer()
pronunciations = cmudict.dict()
lemma = WordNetLemmatizer()


# return true if the word has more than one meaning
def is_homo(word):
    return len(wn.synsets(word)) > 1


# return true if the word has more than one pronunciation
def is_hetero(word):
    if word in pronunciations:
        if len(pronunciations[word]) > 1:
            return True
    return False


# return true if the given word is a verb with no meaning on its own, such as be, do, have
def is_general(word):
    if (word == "was" or word == "been" or word == "are" or word == "did"
class AmericanEnglishLangContext(LanguageBoundsInterface):
    """Defines the properties and implementation of standard American English."""

    ########## Variables ##########
    # Pretrained phoneme dictionary. Created outside of methods because it is
    # used across iterations and is expensive to build; TREAT AS IMMUTABLE.
    _cmu = cmudict.dict()
    # Character used to identify when a token has multiple words. This is
    # specific to the corpus and must be changed if the corpus is changed.
    _MULTI_TOKEN_INDICATOR = "_"
    # Used by the algorithm to indicate that no corresponding phoneme could
    # be found for a token.
    _NULL_PHENOME_INDICATOR = "*NONE*"
    # The threshold that must be passed for a word to be considered similar.
    # Scaled from 0-1.
    _SIMILARITY_THRESHOLD = 0.2
    # All phonemes that produce vowel-related sounds in this language.
    vowelphenomes = [
        "AA", "AE", "AH", "AO", "AW", "AY", "AX", "AXR", "EH", "ER", "EY",
        "IH", "IX", "IY", "OW", "OY", "UH", "UW", "UX"
    ]
    ###############################

    def _getproperformattype(self, unformattoken):
        """Parse a Wordnet synset token to retrieve only the word itself.
        In future implementations this function may not be needed if the
        corpus can return only the word as a string."""
        name, _ = unformattoken.name().split(".", 1)
        return name

    def _getproperhandlemissingphenome(self, unknowntoken):
        """Take a token that could not be evaluated by CMUdict and attempt to
        generate a phoneme. If the CMUdict or Wordnet implementation is
        changed, this function MUST be changed."""
        finaleval = []
        # Testing showed that evaluating the first two letters yields the most
        # consistent results for unknown phonemes.
        tokenlen = len(unknowntoken)
        if tokenlen == 0:
            finaleval.append([self._NULL_PHENOME_INDICATOR])
        elif tokenlen == 1:
            finaleval.append([unknowntoken.upper()])  # the letter IS the phoneme
        else:
            relevant = unknowntoken[:2]  # first two characters
            finalattempt = self._cmu.get(relevant, None)
            if finalattempt is None:
                # No possible phoneme can be generated by this algorithm.
                finaleval.append([self._NULL_PHENOME_INDICATOR])
            else:
                # cmudict values are lists of pronunciation lists, so extend
                # to flatten one level; this keeps the result parsable.
                finaleval.extend(finalattempt)
        return finaleval

    def _getproperhandlemultitoken(self, multitoken):
        """Break a multi-word token (words separated by '_' by Wordnet) into a
        format that CMUdict can evaluate. If the CMUdict or Wordnet
        implementation is changed, this function MUST be changed."""
        finaleval = []
        individualtokens = multitoken.split(self._MULTI_TOKEN_INDICATOR)
        # Evaluate each token's phonemes individually, so the multitoken is
        # represented by each phoneme calculated when returned to scanning.
        for token in individualtokens:
            phenome = self._cmu.get(token.lower(), None)
            if phenome is None:
                phenome = self._getproperhandlemissingphenome(token)
            # Extend to flatten one level; this keeps the result parsable.
            finaleval.extend(phenome)
        return finaleval

    def getphenomes(self, arg):
        """Return all phoneme lists related to the token."""
        # Uses CMUdict as the core processing algorithm. If CMUdict fails to
        # find a match, predict a possible phoneme for the token.
        # This function is guaranteed to return a value.
        generatephenome = self._cmu.get(arg.lower(), None)
        if generatephenome is None:
            if self._MULTI_TOKEN_INDICATOR in arg:
                generatephenome = self._getproperhandlemultitoken(arg)
            else:  # the token is unknown to CMUdict
                generatephenome = self._getproperhandlemissingphenome(arg)
        # When multiple phonemes exist for the same word, a list[list[str]]
        # is generated.
        return generatephenome

    def hypernyms(self, context, arg):
        """Return all hypernyms related to the token. (`context` is the
        representation of the phrase in collection form.)"""
        # Assumes Wordnet; if the Wordnet implementation changes, this MUST change.
        eval = None
        interpretation = lesk(context, arg)
        if interpretation is not None:
            eval = map(self._getproperformattype, interpretation.hypernyms())
        return eval

    def hyponyms(self, context, arg):
        """Return all hyponyms related to the token."""
        # Assumes Wordnet; if the Wordnet implementation changes, this MUST change.
        eval = None
        interpretation = lesk(context, arg)
        if interpretation is not None:
            eval = map(self._getproperformattype, interpretation.hyponyms())
        return eval

    def messagefail(self, input):
        """Produce the failure message to print if the process cannot return a value."""
        built = " ".join(input)
        return ("Your input: '" + built + "' was not able to be parsed under "
                "the conditions you desired. Please try new conditions or try "
                "a new phrase.")

    def messageonlyresult(self, arg):
        """Produce an indicator message if only one result was possible from
        the given input parameters."""
        return "This is the only result processed from the given input:\n" + arg

    def messagetopresult(self, resultlen, requestedresultcount):
        """Produce the top-'x'-results message when the process has multiple results."""
        if resultlen < requestedresultcount:
            return "Top " + str(resultlen) + " result(s):\n"
        return "Top " + str(requestedresultcount) + " result(s):\n"

    def similarity(self, contextclues, arg1, arg2):
        """Return a (bool, score) pair: whether the words are similar enough
        to satisfy the language criteria, and the score of that evaluation."""
        # Assumes Wordnet; if the Wordnet implementation changes, this MUST change.
        evaluation = False
        score = 0
        if arg1 == arg2:  # `is` compared identity, not string equality
            evaluation = True
            # Penalize the score to prevent paraphrases from returning themselves.
            score = self._SIMILARITY_THRESHOLD
        else:
            contextA = lesk(contextclues, arg1)
            contextB = lesk(contextclues, arg2)
            if contextA and contextB:  # otherwise the score stays zero
                score = contextA.path_similarity(contextB)
                if score is not None and self._SIMILARITY_THRESHOLD <= score:
                    evaluation = True
        return (evaluation, score)

    def split(self, arg):
        # Return all non-whitespace tokens.
        return RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(arg)
def __init__(self):
    self.cmudict = cmudict.dict()
def do_sentences_rhyme_unit_test():
    dictionary = cmudict.dict()
    print(do_sentences_rhyme('oh hello', 'no yellow', dictionary))
    print(do_sentences_rhyme('so the dog', 'log', dictionary))
    print(do_sentences_rhyme('potato', 'wefo', dictionary))
    print(do_sentences_rhyme('hog', 'log', dictionary))
    for i, tag in enumerate(tag_list):
        spelling, sense, pron = tag
        info = '(' + str(sense) + ',' + pron + ')'
        loc = sent.find(spelling, current_loc)
        sent = sent[:loc + len(spelling)] + info + sent[loc + len(spelling):]
        current_loc = loc + len(spelling) + len(info)
    format_output['sentence'][row_idx] = sent
    print(sent, '--- Source: ', row['citation'])

    format_output.to_csv(filename, index=False, header=0)
    return


## Set up basic corpora
pron_dict = cmudict.dict()
brown_words = brown.tagged_words(tagset='universal')
treebank_words = treebank.tagged_words(tagset='universal')
nps_words = nps_chat.tagged_words(tagset='universal')
corpus = brown_words + treebank_words + nps_words
corpus = [(word.lower(), tag) for (word, tag) in corpus]
stopset = set(stopwords.words('english'))

## Set up pretrained spaCy word vectors
nlp = spacy.load('en_core_web_lg')

## Collect potential heteronyms
data = get_het_from_corpus(corpus)

## Assign Wiktionary data to the potential heteronyms
parser = init_wikparser()
data = get_pronunciation(parser, data)
""" Classes and utilities for extracting haiku from arbitrary text and evaluating them based on some programmatically defined criteria """ import nltk import string from nltk.corpus import cmudict from nltk_util import syllables_en from haikus.evaluators import DEFAULT_HAIKU_EVALUATORS global WORD_DICT try: WORD_DICT = cmudict.dict() except LookupError: nltk.download('cmudict') WORD_DICT = cmudict.dict() class NonwordError(Exception): pass class HaikuText(object): """ A wrapper around some sequence of text """ def __init__(self, text=None): self._text = text def get_text(self): return self._text
def unit_test_count_syllables_sentence():
    dictionary = cmudict.dict()
    print(count_syllables_sentence('hello please check my syllables', dictionary))
    print(count_syllables_sentence('checking some syllables right now dog', dictionary))
from __future__ import print_function
from __future__ import division

from scipy.integrate import quad
import random
import numpy as np
import codecs
import string
import re
import cPickle as cp  # Python 2; on Python 3 use `import pickle as cp`
from nltk import pos_tag
from nltk import word_tokenize
import collections
from nltk.corpus import cmudict

d = cmudict.dict()  # dictionary of syllables from cmudict


def check_unique(unique):
    out = open('unique.txt', 'w')
    for i in unique:
        out.write(i + '\n')
    out.close()


# Parses the text file 'shakespeare.txt' and adds each unique word to a
# dictionary, WORD_DIC, with a unique index.
def parse(word_dic, index_dic):
    # open 'shakespeare.txt'
    'date_of_publication', 'num_of_words', 'num_of_non_empty_lines',
    'num_of_verses', 'avg_word_len', 'avg_line_len', 'avg_lines_per_verse',
    'longest_line', 'words_per_line', 'largest_word',
    'poem_stress_list_no_punct', 'chars_per_line'
])

#
# Load JSON
#
with open(DATA_DIR + READ_JSON_FILE, 'r') as infh:
    cnt = 0
    no_lines = 0
    largest_word_corpus_ls = []
    prondict = cmudict.dict()

    # for every poem-file-object
    for data in import_utilities.json_parse(infh):
        # process object
        cnt = cnt + 1
        # print "cnt:", cnt
        labels_ls = []
        author = 'UNKNOWN'
        title = 'UNKNOWN'
        # get the data out of the json
        for idx, val in enumerate(data):
            # print idx, val
from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer
import os.path, time
import datetime
import common
import rssNewsFetcher
import pickle

d = cmudict.dict()  # get the CMU Pronouncing Dict
phrasetokenizer = RegexpTokenizer(r"[\w| |\-|\'|\‘|\’|\$]+")
wordtokenizer = RegexpTokenizer(r"[\w+|\']+")
soundtokenizer = RegexpTokenizer(r"[A-Z]+")


def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)


def nsyl(word):
    """Return the max syllable count in the case of multiple pronunciations."""
    lastsound = ''
    syllables = 0
    try:
        if isinstance(d[word.lower()], list):
            word = d[word.lower()][0]
        for sound in word:
            if hasNumbers(sound):
                syllables = syllables + 1
                lastsound = ''
            # append the last sound
            lastsound += soundtokenizer.tokenize(sound)[0]
kyle_tokens = kyle_quotes_lower.apply(nltk.word_tokenize)
# kyle_quotes.head()
kyle_tokens_list = [
    word for inner_list in list(kyle_tokens) for word in inner_list
]
kyle_tokens_list = [
    re.sub(r'[^A-Za-z0-9\'\-{1}]+$|\'$', 'punc', i) for i in kyle_tokens_list
]
kyle_lexical_diversity = len(set(kyle_tokens_list)) / len(kyle_tokens_list)
# print(kyle_lexical_diversity)
# len(kyle_tokens_list)/len(kyle_tokens)

top_characters = quotes_by_character.count()[
    quotes_by_character.count().Line > 1000].index
pro_dict = cmudict.dict()


def get_character_params(data, character):
    character_quotes = data[data.Character == character].Line
    character_quotes_lower = character_quotes.apply(str.lower).apply(
        str.rstrip, '\n')
    character_tokens = character_quotes_lower.apply(nltk.word_tokenize)
    character_tokens_list = [
        word for inner_list in list(character_tokens) for word in inner_list
    ]
    character_tokens_list = [
        re.sub(r'[^A-Za-z0-9\'\-{1}]+$|\'$', 'punc', i)
        for i in character_tokens_list
    ]
## SETUP
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--rhyme', dest='rhyme',
                    help='provide a word to find its rhymes')
parser.add_argument('-p', '--phones', dest='phones',
                    help='provide a word/sentence to see its phonemes')
args = parser.parse_args()

## START
l_ents = cmudict.entries()  # "list" of entries
d_ents = cmudict.dict()     # "dict" of entries

# if they are using command line args, single usage mode
if len(argv) > 1:
    if args.rhyme:
        get_rhymes(args.rhyme)
    if args.phones:
        get_phones(args.phones)
# interactive mode with repeating menu and options
else:
    inp = ''
    while inp != 'q':
        print('\n(1) Find phonemes\n(2) Find rhyming words\n(q) Quit\n')
sys.path.append(nlp_dir)
import util

VERSION_MAJOR = 0
VERSION_MINOR = 7

MODULE_NAME = 'termset_expander.py'

global DEBUG
DEBUG = False

# load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# initialize the CMU phoneme dictionary
cmu_dict = cmudict.dict()

# regexes for locating termsets in NLPQL files

# line comment - match everything from // up to but NOT including the newline;
# also, don't match the // in a URL
str_line_comment = r'(?<!http:)(?<!https:)//.*(?=\n)'
regex_line_comment = re.compile(str_line_comment, re.IGNORECASE)

# multiline comment
str_multiline_comment = r'/\*.*?\*/'
regex_multiline_comment = re.compile(str_multiline_comment,
                                     re.IGNORECASE | re.DOTALL)

# a term is anything enclosed in double quotes
str_term = r'\"[^"]+\"'
import Models
from nltk.corpus import cmudict

"""
Global variables for the reading level project.
This module holds the global variables for the project.

Authors:
    Charles Billingsley
    Josh Getter
    Adam Stewart
    Josh Techentin
"""

# Main Globals
dictionary = cmudict.dict()
input_file = ''
file_content = ''
current_line_number = 0
full_input = ''
total_words = 0
total_sentences = 0
total_syllables = 0
shouldModify = False

# ChangeLevel Globals
target_reading_level = ''
target_reading_score = Models.ReadingScoreRange()
def __syllables__(word):
    print("Doing syllables lookup for", word)
    d = cmudict.dict()
    if word == '':
        return 0
    # Count stress-marked phonemes in the first listed pronunciation.
    return [len([y for y in x if y[-1].isdigit()]) for x in d[word]][0]
#!/usr/bin/python
import re
import inflect
import hyphenator
from nltk.corpus import cmudict

d = cmudict.dict()  # probably will need this
p = inflect.engine()


# Takes a word and separates its syllables so that they are hyphenated.
# If it takes an already-hyphenated word, it just returns the hyphenated word.
def hyphenate_word(word):
    word = '-'.join(hyphenator.hyphenate_word(word))
    if '--' in word:
        word = '-'.join(word.split('--'))
    return word


# prints a hyphenated version of the phrase
def hyphenate_phrase(phrase):
    words = phrase.split(" ")
    returnme = []
    for word in words:
        returnme.append(hyphenate_word(word))
    print(" ".join(returnme))


def word_syllable_count(word):
    return hyphenate_word(word).count('-') + 1
# -*- coding: utf-8 -*-
# Tools for working with poems
#
# Licensed under GPLv2 or later.

from __future__ import print_function

import json, os, re, sys
from collections import defaultdict
from string import ascii_lowercase

from Levenshtein import distance

from .countsyl import count_syllables

try:
    from nltk.corpus import cmudict
    cmu = cmudict.dict()
except (ImportError, LookupError):
    # Fall back to a bundled JSON copy of cmudict when nltk (or its corpus
    # data) isn't available; the original used a bare `except`.
    with open(os.path.join(os.path.dirname(__file__),
                           'cmudict/cmudict.json')) as json_file:
        cmu = json.load(json_file)


def elided_d(word):
    if word[-2:] == "'d":
        return word[:-2] + "ed"
    return word


def tokenize(poem):
    tokens = []
    for line in poem.split('\n'):
        line = line.replace('-', ' ')  # need to find a better tokenizer, but this works for now
        no_hyphens = line.replace('—', ' ')
        cleaned = re.sub(r'[^0-9a-zA-Z\s\']', '', no_hyphens)  # keep apostrophes
from collections import Counter
from nltk.corpus import words  # dictionary check
from nltk import pos_tag as posTag
import emoji  # pip install
import re  # elongation
from autocorrect import spell  # pip install; spelling check
from nltk.tokenize import sent_tokenize  # sentence tokenizer, see https://www.nltk.org/api/nltk.tokenize.html
import csv  # read file
from datetime import datetime  # convert unix time to human time
from nltk.tokenize import RegexpTokenizer  # remove punctuation
from nltk import edit_distance as ed  # spelling-correction distance
import urllib.request as urllib  # Urban Dictionary: convert url to unicode
from nltk.corpus import cmudict
import math

punctuations = RegexpTokenizer(r'\w+')
CMUdict = cmudict.dict()  # syllable counting


class preProcess(object):

    def __init__(self):
        '''loads urban dictionary and emoji list'''
        self.ud = self.urbanLoad()
        self.emojiList = self.emojiLoad()

    def chanCleaner(self, post):  # clean 4archive posts
        '''cleans 4chan posts by removing the initial disclaimer'''
import nltk
nltk.download('cmudict')
from nltk.corpus import cmudict
import numpy as np

d = cmudict.dict()


def syllable_count(word):
    try:
        return np.min([
            len(list(y for y in x if y[-1].isdigit()))
            for x in d[word.lower()]
        ])
    except KeyError:
        # if the word is not found in cmudict, fall back to a heuristic
        return _syllables(word)


def _syllables(word):
    # adapted from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    count = 0
    vowels = 'aeiouy'
    word = word.lower()
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    # The excerpt was cut off here; the referenced heuristic ends by ensuring
    # a minimum of one syllable (assumed completion):
    if count == 0:
        count += 1
    return count
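# Hedged usage sketch (added; not part of the original module):
if __name__ == "__main__":
    print(syllable_count("beautiful"))  # expected: 3, from cmudict
    print(syllable_count("covfefe"))    # OOV: falls back to the vowel-run heuristic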
from nltk.corpus import cmudict
from pattern.en import parse, parsetree, wordnet, NOUN, pluralize
from BasicModels import Error
import os
import sys  # needed for sys.exit below
import settings
import logging

LOGGER = logging.getLogger("pattern.server")
PRON = cmudict.dict()
AEIOU = ['A', 'E', 'I', 'O', 'U']


# countable features from celex
def readNounList(fileName):
    nounList = open(fileName, "r")
    raw = nounList.read().splitlines()
    maps = dict()
    for line in raw:
        data = line.strip().split("\t")
        key = data[0]
        cop = data[1]
        if len(data) != 14:
            print("Read list wrong!")
            sys.exit(0)
        if key in maps:  # has_key() is Python 2 only
            tmp = maps.get(key)
            if cop > tmp:
                maps[key] = data[1:]
        else:
            pass
import re
import numpy as np
import pandas as pd

# nltk's cmudict: its entries method lists every word with its phonemes
import nltk
from nltk.corpus import stopwords  # for stopwords.words("english")
from nltk.corpus import cmudict

# the two scikit-learn classes needed to compute tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# In[618]:

# use the hashmap version of cmudict for faster lookups
phonetic_check_dict = cmudict.dict()

# In[619]:

# cmudict.entries() is a list whose elements are tuples: tuple[0] is the word
# (or letter sequence) and tuple[1] is the corresponding phonemes

# In[620]:


def get_data(filename):
    data_origin = pd.read_csv(filename, encoding='utf-8')
    return data_origin

# In[621]:
class SG:
    # NOTE: these are class-level attributes and are shared across instances.
    normalized_words = []
    pronunciation_tokens = []
    post_prosody = []
    cmud = cmudict.dict()
    sound_dict = sound_dict_generator.Synth().diphones

    # THIS IS USED FOR TESTING
    # def __init__(self):
    #     self.normalized_words = ['<beginning>', '<question>', 'hello', 'there',
    #         'professor', '<break,comma,1>', 'how', 'are', 'you', 'doing',
    #         '<break,question,2>', 'i', 'am', 'good', '<break,sent_end,2>',
    #         '<exclamation>', 'This', 'is', 'so', 'amazing',
    #         '<break,exclamation,2>', '<end>']
    #     self.normalized_words = ['doctor', 'rabbits', 'email', 'is', 'i', 'l',
    #         'u', 'v', 'c', 'a', 'r', 'r', 'o', 't', 's', 'three', 'zero',
    #         'five', 'at', 'g', 'mail', 'dot', 'c', 'o', 'm',
    #         '<break,sent_end,2>', 'you', 'can', 'checkout', 'his', 'website',
    #         '<break,comma,1>', 'r', 'a', 'b', 'b', 'i', 't', 'd', 'r', 'dot',
    #         'g', 'o', 'v', '<break,sent_end,2>', 'he', 'uses', 'forty',
    #         'milliliters', 'beakers', 'to', 'find', 'tilde', 'volume',
    #         '<break,sent_end,2>', 'he', 'has', '<currency>', 'negative',
    #         'three', 'dollars', 'in', 'his', 'bank', 'account',
    #         '<break,sent_end,2>']

    def __init__(self, n_w: list):
        self.normalized_words = n_w
        self.text_to_phoneme()
        self.prosody_analyzer()

    def text_to_phoneme(self):
        for w in self.normalized_words:  # each token from normalized_words
            if w in self.cmud:
                # copy the entry so the shared cmudict lists aren't mutated in place
                phone = list(self.cmud[w][0])
                for i in range(len(phone)):
                    phone[i] = re.sub(r"[^a-zA-Z\s\-]", "", phone[i]).lower()
                # add the phoneme form of the word to pronunciation_tokens
                self.pronunciation_tokens.append(phone)
            elif w[0] == '<' and w[-1] == '>':
                self.pronunciation_tokens.append([w])
            else:
                # Spell out unknown tokens: greedily match the longest chunk
                # (up to 5 characters) that cmudict knows. The original nested
                # try/except ladder overwrote `phone` per chunk; accumulating
                # the phonemes of every matched chunk appears to be the intent.
                phone = []
                i = 0
                while i < len(w):
                    for length in (5, 4, 3, 2, 1):
                        chunk = w[i:i + length].lower()
                        if chunk in self.cmud:
                            phone.extend(self.cmud[chunk][0])
                            i += length
                            break
                    else:
                        i += 1  # no match at all for this character; skip it
                for i in range(len(phone)):
                    phone[i] = re.sub(r"[^a-zA-Z\s\-]", "", phone[i]).lower()
                self.pronunciation_tokens.append(phone)
        # TODO: figure out what to do with words not in the cmu dictionary.
        # Possibilities: should we get the root? use the google converter?
    def prosody_analyzer(self):
        temp = []
        for w in self.pronunciation_tokens:
            if w[0] == "<beginning>" or w[0] == "<end>":
                temp.append('pau')
                if w[0] == "<end>":
                    temp.append(w[0])
            elif w[0] == "<break,comma,1>":
                temp.extend(['pau', 'pau'])
            elif w[0] in ("<break,semicolon,1.5>", "<break,colon,1.5>"):
                temp.extend(['pau', 'pau', 'pau'])
            elif w[0] in ("<break,sent_end,2>", "<break,question,2>",
                          "<break,exclamation,2>"):
                temp.extend(['pau', 'pau'])
                temp.append(w[0])
                temp.append('pau')
            elif w[0] in ("<question>", "<exclamation>"):
                temp.append(w[0])
            elif w[0] == "<space>":
                continue
            else:
                for p in w:
                    temp.append(p)

        # Pair neighbouring phonemes into diphones, passing markers through.
        markers = ("<exclamation>", "<question>", "<break,sent_end,2>",
                   "<break,question,2>", "<break,exclamation,2>")
        for i in range(len(temp)):
            if temp[i] in markers:
                self.post_prosody.append(temp[i])
                continue
            if i != len(temp) - 1:
                if temp[i + 1] in markers or temp[i + 1] == "<end>":
                    if temp[i + 1] == "<end>":
                        self.post_prosody.append(temp[i + 1])
                    else:
                        # skip over the marker and pair with the phoneme after it
                        self.post_prosody.append(temp[i] + '-' + temp[i + 2])
                else:
                    self.post_prosody.append(temp[i] + '-' + temp[i + 1])
        print(self.post_prosody)