def getTextScores(text, locale='en_GB', simplewordlist=[]):
    """
    Calculates several text scores based on a piece of text.

    A custom locale can be provided for the hyphenator, which maps to a
    Myspell hyphenator dictionary. If the locale names an existing path
    (as tested with os.path.exists) the dictionary at that path will be
    used instead of those in the default Myspell location.

    The simple word list should be provided in lower case.

    Arguments:
    text -- the text to analyse, either a UTF-8 encoded byte string or a
            unicode string.
    locale -- Myspell locale name (e.g. 'en_GB') or path of a hyphenation
              dictionary.
    simplewordlist -- collection of lower case words that count as
                      "simple" words. It is only read, never modified.

    Returns a dict with the counts and averages described in the comments
    of the scores dict below. The averages are floats; they are left at 0
    when the text contains no sentences or no words.
    """
    from nltk.tokenize import sent_tokenize
    from hyphenator import Hyphenator
    import re
    import os

    # check if the locale is supplied as a file
    if os.path.exists(locale):
        hyphenator = Hyphenator(locale)
    else:
        hyphenator = Hyphenator("/usr/share/myspell/hyph_" + locale + ".dic")

    scores = {
        'sent_count': 0,          # nr of sentences
        'word_count': 0,          # nr of words
        'letter_count': 0,        # nr of characters in words (no spaces)
        'syll_count': 0,          # nr of syllables
        'polysyllword_count': 0,  # nr of polysyllables (words with 3 or more syllables)
        'simpleword_count': 0,    # nr of simplewords (depends on provided list)
        'sentlen_average': 0,     # words per sentence
        'wordlen_average': 0,     # syllables per word
        'wordletter_average': 0,  # letters per word
        'wordsent_average': 0     # sentences per word
    }

    # Decode byte strings once, up front. Calling .decode() on every
    # sentence breaks for text that already is unicode, and tokenizing raw
    # UTF-8 bytes lets byte-oriented regexes look inside multi-byte
    # characters.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # A set makes the per-word membership test O(1); "or ()" keeps a None
    # or otherwise empty word list working as "no simple words".
    simple_words = frozenset(simplewordlist or ())
    word_pattern = re.compile(r'\w+', re.UNICODE)

    sentences = sent_tokenize(text)
    scores['sent_count'] = len(sentences)

    for sentence in sentences:
        words = word_pattern.findall(sentence)
        scores['word_count'] += len(words)
        for word in words:
            # every hyphenation point separates two syllables
            syllables_count = hyphenator.inserted(word).count('-') + 1
            scores['letter_count'] += len(word)
            scores['syll_count'] += syllables_count
            if syllables_count > 2:
                scores['polysyllword_count'] += 1
            if word.lower() in simple_words:
                scores['simpleword_count'] += 1

    # float() forces true division on Python 2 as well; the guards avoid a
    # ZeroDivisionError for empty text.
    if scores['sent_count'] > 0:
        scores['sentlen_average'] = float(scores['word_count']) / scores['sent_count']
    if scores['word_count'] > 0:
        word_count = float(scores['word_count'])
        scores['wordlen_average'] = scores['syll_count'] / word_count
        scores['wordletter_average'] = scores['letter_count'] / word_count
        scores['wordsent_average'] = scores['sent_count'] / word_count
    return scores
def hyphenateText(text):
    """
    Add lyrics hyphenation to the selected text.

    Asks for a language via askLanguage(). When one is chosen, every match
    handed over by _wordSub is replaced by its hyphenated form, using
    ' -- ' as the separator. Returns None when no language was chosen.
    """
    lang = askLanguage()
    if not lang:
        return None

    # imported late, only needed once a language has been picked
    from hyphenator import Hyphenator
    hyph = Hyphenator(hyphdicts[lang])

    def hyphenate_match(match):
        return hyph.inserted(match.group(), ' -- ')

    return _wordSub(hyphenate_match, text)
def getTextScores(text, locale='en_GB', simplewordlist=[], smoggy=False):
    """
    Calculates several text scores based on a piece of text.

    A custom locale can be provided for the hyphenator, which maps to a
    Myspell hyphenator dictionary.

    The simple word list should be provided in lower case.

    Arguments:
    text -- the text to analyse, either a UTF-8 encoded byte string or a
            unicode string.
    locale -- Myspell locale name (e.g. 'en_GB').
    simplewordlist -- collection of lower case words that count as
                      "simple" words. It is only read, never modified.
    smoggy -- when true and the text has more than 30 sentences, only a
              sample of 30 sentences (10 from the beginning, 10 from the
              middle, 10 from the end) is scored.

    Returns a dict with the counts and averages described in the comments
    of the scores dict below. The averages are floats; they are left at 0
    when the text contains no sentences or no words.
    """
    from nltk.tokenize import sent_tokenize
    from hyphenator import Hyphenator
    import re

    hyphenator = Hyphenator("/usr/share/myspell/hyph_" + locale + ".dic")

    scores = {
        'sent_count': 0,          # nr of sentences
        'word_count': 0,          # nr of words
        'letter_count': 0,        # nr of characters in words (no spaces)
        'syll_count': 0,          # nr of syllables
        'polysyllword_count': 0,  # nr of polysyllables (words with 3 or more syllables)
        'simpleword_count': 0,    # nr of simplewords (depends on provided list)
        'sentlen_average': 0,     # words per sentence
        'wordlen_average': 0,     # syllables per word
        'wordletter_average': 0,  # letters per word
        'wordsent_average': 0     # sentences per word
    }

    # Decode byte strings once, up front. Calling .decode() on every
    # sentence breaks for text that already is unicode, and tokenizing raw
    # UTF-8 bytes lets byte-oriented regexes look inside multi-byte
    # characters.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # A set makes the per-word membership test O(1); "or ()" keeps a None
    # or otherwise empty word list working as "no simple words".
    simple_words = frozenset(simplewordlist or ())
    word_pattern = re.compile(r'\w+', re.UNICODE)

    sentences = sent_tokenize(text)
    # don't assign this to scores, as sentences may need to be recalculated
    sent_count = len(sentences)

    if smoggy and sent_count > 30:
        # see http://webpages.charter.net/ghal/SMOG_Readability_Formula_G._Harry_McLaughlin_%281969%29.pdf
        # get a sample of 10 sentences from the beginning, middle and the
        # end of the text
        middle = sent_count // 2
        sentences = (sentences[:10]
                     + sentences[middle - 5:middle + 5]
                     + sentences[-10:])

    scores['sent_count'] = len(sentences)

    for sentence in sentences:
        words = word_pattern.findall(sentence)
        scores['word_count'] += len(words)
        for word in words:
            # every hyphenation point separates two syllables
            syllables_count = hyphenator.inserted(word).count('-') + 1
            scores['letter_count'] += len(word)
            scores['syll_count'] += syllables_count
            if syllables_count > 2:
                scores['polysyllword_count'] += 1
            if word.lower() in simple_words:
                scores['simpleword_count'] += 1

    # float() forces true division on Python 2 as well; the guards avoid a
    # ZeroDivisionError for empty text.
    if scores['sent_count'] > 0:
        scores['sentlen_average'] = float(scores['word_count']) / scores['sent_count']
    if scores['word_count'] > 0:
        word_count = float(scores['word_count'])
        scores['wordlen_average'] = scores['syll_count'] / word_count
        scores['wordletter_average'] = scores['letter_count'] / word_count
        scores['wordsent_average'] = scores['sent_count'] / word_count
    return scores
def getTextScores(text, locale='en_GB', simplewordlist=[]):
    """
    Calculates several text scores based on a piece of text.

    A custom locale can be provided for the hyphenator, which maps to a
    Myspell hyphenator dictionary.

    The simple word list should be provided in lower case.

    Arguments:
    text -- the text to analyse, either a UTF-8 encoded byte string or a
            unicode string.
    locale -- Myspell locale name (e.g. 'en_GB').
    simplewordlist -- collection of lower case words that count as
                      "simple" words. It is only read, never modified.

    Returns a dict with the counts and averages described in the comments
    of the scores dict below. The averages are floats; they are left at 0
    when the text contains no sentences or no words.
    """
    from nltk.tokenize import sent_tokenize
    from hyphenator import Hyphenator
    import re

    hyphenator = Hyphenator("/usr/share/myspell/hyph_" + locale + ".dic")

    scores = {
        'sent_count': 0,          # nr of sentences
        'word_count': 0,          # nr of words
        'letter_count': 0,        # nr of characters in words (no spaces)
        'syll_count': 0,          # nr of syllables
        'polysyllword_count': 0,  # nr of polysyllables (words with 3 or more syllables)
        'simpleword_count': 0,    # nr of simplewords (depends on provided list)
        'sentlen_average': 0,     # words per sentence
        'wordlen_average': 0,     # syllables per word
        'wordletter_average': 0,  # letters per word
        'wordsent_average': 0     # sentences per word
    }

    # Decode byte strings once, up front. Calling .decode() on every
    # sentence breaks for text that already is unicode, and tokenizing raw
    # UTF-8 bytes lets byte-oriented regexes look inside multi-byte
    # characters.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # A set makes the per-word membership test O(1); "or ()" keeps a None
    # or otherwise empty word list working as "no simple words".
    simple_words = frozenset(simplewordlist or ())
    word_pattern = re.compile(r'\w+', re.UNICODE)

    sentences = sent_tokenize(text)
    scores['sent_count'] = len(sentences)

    for sentence in sentences:
        words = word_pattern.findall(sentence)
        scores['word_count'] += len(words)
        for word in words:
            # every hyphenation point separates two syllables
            syllables_count = hyphenator.inserted(word).count('-') + 1
            scores['letter_count'] += len(word)
            scores['syll_count'] += syllables_count
            if syllables_count > 2:
                scores['polysyllword_count'] += 1
            if word.lower() in simple_words:
                scores['simpleword_count'] += 1

    # float() forces true division on Python 2 as well; the guards avoid a
    # ZeroDivisionError for empty text.
    if scores['sent_count'] > 0:
        scores['sentlen_average'] = float(scores['word_count']) / scores['sent_count']
    if scores['word_count'] > 0:
        word_count = float(scores['word_count'])
        scores['wordlen_average'] = scores['syll_count'] / word_count
        scores['wordletter_average'] = scores['letter_count'] / word_count
        scores['wordsent_average'] = scores['sent_count'] / word_count
    return scores
def hyphenate(text, mainwindow):
    """
    Ask the user which language to use and hyphenate the text with it.

    A dialog lists the languages found in hyphdicts, preselecting the
    "lastused" entry of the "hyphenation" config group. When the dialog is
    accepted the choice is written back to the config and the text is
    returned with ' -- ' inserted into every match of ly.rx.lyric_word.

    Returns None if the user cancels the dialog or no hyphenation pattern
    files could be found (a message box is shown in the latter case).
    """
    if not hyphdicts:
        KMessageBox.sorry(mainwindow, i18n(
            "Could not find any hyphenation dictionaries.\n\n"
            "Please install a package containing some and/or or configure the "
            "search path to find them in the Frescobaldi settings under "
            "\"Paths.\""))
        return None

    conf = config("hyphenation")
    lang = conf.readEntry("lastused", "")
    langs = sorted(hyphdicts.keys())
    # preselect the language used last time, falling back to the first one
    index = langs.index(lang) if lang in langs else 0

    # build the language selection dialog
    d = KDialog(mainwindow)
    d.setButtons(KDialog.ButtonCode(
        KDialog.Ok | KDialog.Cancel | KDialog.Help))
    d.setCaption(i18n("Hyphenate Lyrics Text"))
    d.setHelp("lyrics")
    layout = QVBoxLayout()
    d.mainWidget().setLayout(layout)
    layout.addWidget(QLabel(i18n("Please select a language:")))
    listbox = QListWidget()
    layout.addWidget(listbox)
    listbox.addItems(langs)
    listbox.setCurrentRow(index)
    listbox.setFocus()

    if not d.exec_():
        # dialog cancelled
        return None

    # remember the choice for the next time
    lang = langs[listbox.currentRow()]
    conf.writeEntry("lastused", lang)
    conf.sync()

    # get hyphenator
    h = Hyphenator(hyphdicts[lang])

    def hyphenate_match(m):
        return h.inserted(m.group(), ' -- ')

    return ly.rx.lyric_word.sub(hyphenate_match, text)
def get_hyphenator_for_language(language):
    """
    Create a Hyphenator for the given language.

    The language code is looked up case-insensitively in DICTIONARIES;
    English ('en-us') is used if the language is not found. The dictionary
    file is loaded from the 'dicts' directory next to this module.

    >>> get_hyphenator_for_language('ru-ru') #doctest: +ELLIPSIS
    <hyphenator.Hyphenator object at ...
    """
    key = language.lower()
    if key not in DICTIONARIES:
        # Fallback to English
        key = 'en-us'

    module_dir = os.path.dirname(__file__)
    dict_file = 'dicts/%s.dic' % DICTIONARIES[key]
    return Hyphenator(os.path.join(module_dir, dict_file))
def process_dom(dom, lang):
    """
    Run insert_hyphens over the text-carrying elements of a document.

    A single Hyphenator is created for lang and passed, together with each
    element whose local name is 'p', 'v', 'text-author' or 'div', to
    insert_hyphens. The same dom object is returned.
    """
    hyphenator = Hyphenator(lang)
    tag_names = ('p', 'v', 'text-author', 'div')
    for tag_name in tag_names:
        # local-name() matches the tag regardless of its XML namespace
        expression = "//*[local-name() = '%s']" % tag_name
        for element in dom.xpath(expression):
            insert_hyphens(element, hyphenator)
    return dom
def getTextScores(text, locale='en_GB', simplewordlist=[]):
    """
    Calculates several text scores based on a piece of text.

    A custom locale can be provided for the hyphenator, which maps to a
    Myspell hyphenator dictionary. If the locale names an existing path
    (as tested with os.path.exists) the dictionary at that path will be
    used instead of those in the default Myspell location.

    The simple word list should be provided in lower case.

    Arguments:
    text -- the text to analyse, either a UTF-8 encoded byte string or a
            unicode string.
    locale -- Myspell locale name (e.g. 'en_GB') or path of a hyphenation
              dictionary.
    simplewordlist -- collection of lower case words that count as
                      "simple" words. It is only read, never modified.

    Returns a dict with the counts and averages described in the comments
    of the scores dict below. The averages are floats; they are left at 0
    when the text contains no sentences or no words.
    """
    from nltk.tokenize import sent_tokenize
    from hyphenator import Hyphenator
    import re
    import os

    # check if the locale is supplied as a file
    if os.path.exists(locale):
        hyphenator = Hyphenator(locale)
    else:
        hyphenator = Hyphenator("/usr/share/myspell/hyph_" + locale + ".dic")

    scores = {
        'sent_count': 0,          # nr of sentences
        'word_count': 0,          # nr of words
        'letter_count': 0,        # nr of characters in words (no spaces)
        'syll_count': 0,          # nr of syllables
        'polysyllword_count': 0,  # nr of polysyllables (words with 3 or more syllables)
        'simpleword_count': 0,    # nr of simplewords (depends on provided list)
        'sentlen_average': 0,     # words per sentence
        'wordlen_average': 0,     # syllables per word
        'wordletter_average': 0,  # letters per word
        'wordsent_average': 0     # sentences per word
    }

    # Work on decoded text throughout: decode byte strings once instead of
    # encoding unicode to UTF-8 and decoding every sentence again.
    # Tokenizing raw UTF-8 bytes lets byte-oriented regexes look inside
    # multi-byte characters, and the bytes check (unlike the "unicode"
    # builtin) exists on both Python 2 and Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # A set makes the per-word membership test O(1); "or ()" keeps a None
    # or otherwise empty word list working as "no simple words".
    simple_words = frozenset(simplewordlist or ())
    word_pattern = re.compile(r'\w+', re.UNICODE)

    sentences = sent_tokenize(text)
    scores['sent_count'] = len(sentences)

    for sentence in sentences:
        words = word_pattern.findall(sentence)
        scores['word_count'] += len(words)
        for word in words:
            # every hyphenation point separates two syllables
            syllables_count = hyphenator.inserted(word).count('-') + 1
            scores['letter_count'] += len(word)
            scores['syll_count'] += syllables_count
            if syllables_count > 2:
                scores['polysyllword_count'] += 1
            if word.lower() in simple_words:
                scores['simpleword_count'] += 1

    # float() forces true division on Python 2 as well, where int / int
    # would truncate every average (sentences per word would always be 0).
    if scores['sent_count'] > 0:
        scores['sentlen_average'] = float(scores['word_count']) / scores['sent_count']
    if scores['word_count'] > 0:
        word_count = float(scores['word_count'])
        scores['wordlen_average'] = scores['syll_count'] / word_count
        scores['wordletter_average'] = scores['letter_count'] / word_count
        scores['wordsent_average'] = scores['sent_count'] / word_count
    return scores
def process_chardata(self, text, line=False, par=False):
    """
    Return text hyphenated with soft hyphens (U+00AD).

    A new Hyphenator is created on every call. The line and par arguments
    are accepted but not used here.
    """
    soft_hyphen = u'\u00AD'
    hyphenator = Hyphenator()
    # insert softhyphens
    return hyphenator.hyphenate(text, soft_hyphen)