def to_haiku(h):
    """Greedily split a string of words into 5 / 7 / 5 syllable lines."""
    idx = 0
    count = 0
    h = h.split()
    haiku = []
    # first line: take words until at least 5 syllables are accumulated
    while count < 5 and idx < len(h):
        count += syllables.count(h[idx])
        haiku.append(str(h[idx]))
        idx += 1
    haiku.append(' / ')
    count = 0
    # second line: at least 7 syllables
    while count < 7 and idx < len(h):
        count += syllables.count(h[idx])
        haiku.append(str(h[idx]))
        idx += 1
    haiku.append(' / ')
    count = 0
    # third line: at least 5 syllables
    while count < 5 and idx < len(h):
        count += syllables.count(h[idx])
        haiku.append(str(h[idx]))
        idx += 1
    if idx < len(h):
        # leftover words that did not fit the 5/7/5 pattern
        haiku.append(' || ')
        while idx < len(h):
            haiku.append(str(h[idx]))
            idx += 1
    elif count < 5:
        # ran out of words before the final line was complete
        haiku.append(' | ')
    return ' '.join(haiku)
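# Usage sketch (illustrative, not part of the snippet above): assumes the
# module-level syllables.count(word) helper returns an integer count per word.
# With standard dictionary syllable counts this prints:
#   an old silent pond  /  a frog jumps into the pond  /  splash silence again
if __name__ == '__main__':
    print(to_haiku("an old silent pond a frog jumps into the pond splash silence again"))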
def test_count(self):
    test_string = "reflect respect recline reduce obsessively demonstrate baseball cloud brother cobblestone " + \
                  "complete conspire conflict estuary"
    # Syllable counts: 2 2 2 2 4 3 2 1 2 3 2 2 2 4
    raw_cmu = conversion.get_cmu(test_string.split(" "))
    expected = [2, 2, 2, 2, 4, 3, 2, 1, 2, 3, 2, 2, 2, 4]
    for i, word in enumerate(raw_cmu):
        self.assertEqual(syllables.count(word[0]), expected[i])
    # test some examples with hiatus
    test_hiatus = "duo rio maria created misery harry"
    # syllable counts: 2 2 3 3 3 2
    hiatus_counts = [2, 2, 3, 3, 3, 2]
    raw_cmu_hiatus = conversion.get_cmu(test_hiatus.split(" "))
    for j, word in enumerate(raw_cmu_hiatus):
        self.assertEqual(syllables.count(word[0]), hiatus_counts[j])
def get_complex_word_count(self):
    # a "complex" word has three or more syllables; the count is computed once and cached
    if self.complex_word_count is None:
        self.complex_word_count = 0
        for word in self.get_words():
            if syllables.count(word) >= 3:
                self.complex_word_count += 1
    return self.complex_word_count
def count_syllables(word):
    #d = cmudict.dict()
    #try:
    #    count = [len(list(y for y in x if isdigit(y[-1]))) for x in d[word.lower()]][0]
    #except KeyError:
    #    return syllables.count(word)
    ##print word, count
    #return count
    return syllables.count(word)
def haikuness(words):
    """Score how far a word sequence is from a 5/7/5 haiku (0 = perfect fit)."""
    words = words.split()
    count = 0
    idx = 0
    err = 0
    while count < 5 and idx < len(words):
        count += syllables.count(words[idx])
        idx += 1
    err += abs(5 - count)
    count = 0
    while count < 7 and idx < len(words):
        count += syllables.count(words[idx])
        idx += 1
    err += abs(7 - count)
    count = 0
    while count < 5 and idx < len(words):
        count += syllables.count(words[idx])
        idx += 1
    err += abs(5 - count)
    # any leftover words add their full syllable counts to the error
    while idx < len(words):
        err += syllables.count(words[idx])
        idx += 1
    return err
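# Usage sketch (illustrative, not part of the snippet above): with standard
# dictionary syllable counts a well-formed 5/7/5 haiku scores 0, while any
# overshoot, shortfall, or leftover words raise the error score.
assert haikuness("an old silent pond a frog jumps into the pond splash silence again") == 0
assert haikuness("this is not a haiku") > 0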
def count_syllables(sentence, debug=False):
    # first let's strip out punctuation and emotive marks
    count = 0
    if debug:
        print('received sentence: %s' % sentence)
    sentence = format_input(sentence)
    if debug:
        print('formatted sentence: %s' % sentence)
    words = [w for w in sentence.split() if w.isalpha()]
    if debug:
        print('extracted words: %s' % repr(words))
    nonwords = [w for w in sentence.split() if not w.isalpha()]
    if nonwords:
        print('found nonwords: %s' % repr(nonwords))
    for w in words:
        if is_camel(w):
            # camelCase tokens are split into their component words first
            sylls = count_syllables(de_camel(w))
        else:
            sylls = syllables.count(w)
        count += sylls
        if debug:
            print('%s\t\t\t%d' % (w, sylls))
    if debug:
        print('total\t\t\t%d' % count)
    return count
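# The snippet above calls is_camel / de_camel helpers that are not shown here.
# A minimal sketch of what such helpers might look like (an assumption, not
# the original implementation):
import re

def is_camel(word):
    # a lowercase letter immediately followed by an uppercase one, e.g. "camelCase"
    return re.search(r'[a-z][A-Z]', word) is not None

def de_camel(word):
    # insert spaces at lower->upper boundaries so each hump is counted separately:
    # "camelCase" -> "camel Case"
    return re.sub(r'([a-z])([A-Z])', r'\1 \2', word)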
def find_stress(word, type="all"):
    """Convert stress marking numbers from CMU into actual stress markings
    :param word - the CMU word string to be evaluated for stress markings
    :param type - type of stress to be evaluated (primary, secondary, or both)"""
    syll_count = syllables.count(word)

    if (not word.startswith("__IGNORE__")) and syll_count > 1:
        symbols = word.split(' ')
        stress_map = stress_type(type)
        new_word = []
        clusters = ["sp", "st", "sk", "fr", "fl"]
        # stop searching for where stress starts if these are encountered
        stop_set = ["nasal", "fricative", "vowel"]
        # for each CMU symbol
        for c in symbols:
            # if the last character is a 1 or 2 (that means it has stress, and we want to evaluate it)
            if c[-1] in stress_map.keys():
                # if the new_word list is empty
                if not new_word:
                    # append to new_word the CMU symbol, replacing numbers with stress marks
                    new_word.append(re.sub(r"\d", "", stress_map[re.findall(r"\d", c)[0]] + c))
                else:
                    stress_mark = stress_map[c[-1]]
                    placed = False
                    hiatus = False
                    new_word = new_word[::-1]  # flip the word and backtrack through symbols
                    for i, sym in enumerate(new_word):
                        sym = re.sub("[0-9ˈˌ]", "", sym)
                        prev_sym = re.sub("[0-9ˈˌ]", "", new_word[i - 1])
                        prev_phone = phones[re.sub("[0-9ˈˌ]", "", new_word[i - 1])]
                        if phones[sym] in stop_set or (i > 0 and prev_phone == "stop") or sym in ["er", "w", "j"]:
                            if sym + prev_sym in clusters:
                                new_word[i] = stress_mark + new_word[i]
                            elif not prev_phone == "vowel" and i > 0:
                                new_word[i - 1] = stress_mark + new_word[i - 1]
                            else:
                                if phones[sym] == "vowel":
                                    hiatus = True
                                    new_word = [stress_mark + re.sub("[0-9ˈˌ]", "", c)] + new_word
                                else:
                                    new_word[i] = stress_mark + new_word[i]
                            placed = True
                            break
                    if not placed:
                        if new_word:
                            new_word[len(new_word) - 1] = stress_mark + new_word[len(new_word) - 1]
                    new_word = new_word[::-1]
                    if not hiatus:
                        new_word.append(re.sub(r"\d", "", c))
                        hiatus = False
            else:
                if c.startswith("__IGNORE__"):
                    new_word.append(c)
                else:
                    new_word.append(re.sub(r"\d", "", c))

        return ' '.join(new_word)
    else:
        if word.startswith("__IGNORE__"):
            return word
        else:
            return re.sub("[0-9]", "", word)
def analyze_text(text):
    print("test")
    paragraph_regex = re.compile("\\n\\s*\\n")
    all_paragraphs = re.split(paragraph_regex, text)
    all_paragraphs = [p for p in all_paragraphs if len(p) > 0]

    def slice(n, words):
        n_gram = list(islice(words, n, 4))
        return " ".join(n_gram)

    def nsyl(word):
        # not called in this function; relies on a module-level cmudict dictionary `d`
        return [len(list(y for y in x if isdigit(y[-1]))) for x in d[word.lower()]]

    last_four_words = deque([])
    paragraph_lengths = []
    sentence_lengths = []
    avg_sentence_length_per_para = []
    num_sentences_per_para = []
    entropies = []
    syllables_counter = defaultdict(int)
    parts_of_speech_counter = defaultdict(int)
    word_counter = defaultdict(int)
    two_gram_counter = defaultdict(int)
    three_gram_counter = defaultdict(int)
    four_gram_counter = defaultdict(int)

    for paragraph in all_paragraphs:
        #extract_paragraph_topics() Google News and Wikipedia
        # distance from previous paragraph
        # distance from previous N paragraphs
        paragraph_lengths.append(len(paragraph))
        entropies.append(entropy.shannon_entropy(paragraph.encode('utf-8').strip()))

        num_sentences = 0
        paragraph_sent_lengths = []
        for sentence in sent_tokenize(paragraph):
            num_sentences += 1
            paragraph_sent_lengths.append(len(sentence))
            sentence_lengths.append(len(sentence))
        # average sentence length is computed once per paragraph
        avg_sentence_length_per_para.append(
            sum(paragraph_sent_lengths) / float(len(paragraph_sent_lengths)))
        num_sentences_per_para.append(num_sentences)

        words = word_tokenize(paragraph)
        pos = pos_tag(words)
        for word, p in pos:
            syllables_counter[str(syllables.count(word))] += 1
            word_counter[word] += 1
            if word is not None:
                last_four_words.append(word)
                parts_of_speech_counter[p] += 1
                if len(last_four_words) > 4:
                    last_four_words.popleft()
                    four_gram_counter[slice(0, last_four_words)] += 1
                if len(last_four_words) >= 2:
                    two_gram_counter[slice(2, last_four_words)] += 1
                if len(last_four_words) >= 3:
                    three_gram_counter[slice(1, last_four_words)] += 1

    all_syllables = []
    for key in syllables_counter.keys():
        all_syllables.extend([float(key)] * syllables_counter[key])

    total_parts_of_speech = 0
    for key in parts_of_speech_counter:
        total_parts_of_speech += parts_of_speech_counter[key]

    df1 = pd.DataFrame({
        'average_paragraph_length': [np.mean(paragraph_lengths)],
        'average_sentence_lengths': [np.mean(sentence_lengths)],
        'avg_entropies': [np.mean(entropies)],
        'avg_syllables': [np.mean(all_syllables)]
    })
    for part_of_speech in parts_of_speech_counter.keys():
        df1[part_of_speech + "_prop"] = parts_of_speech_counter[part_of_speech] / float(total_parts_of_speech)

    total_words = 0
    for key in word_counter:
        total_words += word_counter[key]
    for stopword in STOPWORDS:
        df1[stopword + "_prop"] = word_counter[stopword] / float(total_words)

    return df1
# distance from previous N paragraphs
paragraph_lengths.append(len(paragraph))
entropies.append(entropy.shannon_entropy(paragraph.encode("utf-8")))

num_sentences = 0
paragraph_sent_lengths = []
for sentence in sent_tokenize(paragraph):
    num_sentences += 1
    paragraph_sent_lengths.append(len(sentence))
    sentence_lengths.append(len(sentence))
# average sentence length is computed once per paragraph
avg_sentence_length_per_para.append(
    sum(paragraph_sent_lengths) / float(len(paragraph_sent_lengths)))
num_sentences_per_para.append(num_sentences)

words = word_tokenize(paragraph)
pos = pos_tag(words)
for word, p in pos:
    syllables_counter[str(syllables.count(word))] += 1
    word_counter[word] += 1
    if word is not None:
        last_four_words.append(word)
        parts_of_speech_counter[p] += 1
        if len(last_four_words) > 4:
            last_four_words.popleft()
            four_gram_counter[slice(0, last_four_words)] += 1
        if len(last_four_words) >= 2:
            two_gram_counter[slice(2, last_four_words)] += 1
        if len(last_four_words) >= 3:
            three_gram_counter[slice(1, last_four_words)] += 1

# N grams
# Stop Words
# Syllable words
def change(self, i=None):
    if i is None:
        i = self.d.draw()
    self.value = i
    self.str = mydict[i]
    self.syllables = syllables.count(mydict[self.value])
def __init__(self, mydict):
    self.value = mydict.draw()
    self.d = mydict
    self.syllables = syllables.count(mydict[self.value])
    self.str = mydict[self.value]
def get_total_syllable_count(self):
    if self.total_syllable_count is None:
        self.total_syllable_count = 0
        for word in self.get_words():
            self.total_syllable_count += syllables.count(word)
    return self.total_syllable_count