Пример #1
0
def getTextScores(text, locale='en_GB', simplewordlist=None):
    """
    Calculates several text scores based on a piece of text.

    A custom locale can be provided for the hyphenator, which
    maps to a Myspell hyphenator dictionary.  If the locale is the
    path of an existing file, the dictionary at that path is used
    instead of the one in the default Myspell location.

    text may be a UTF-8 encoded byte string or already-decoded text.
    The simple word list should be provided in lower case; when it is
    omitted or empty no simple words are counted.

    Returns a dict with the keys sent_count, word_count, letter_count,
    syll_count, polysyllword_count, simpleword_count, sentlen_average,
    wordlen_average, wordletter_average and wordsent_average.  The
    averages are floats; an average whose denominator is zero (for
    example for empty text) is left at 0 instead of raising
    ZeroDivisionError.
    """
    from nltk.tokenize import sent_tokenize
    from hyphenator import Hyphenator
    import re
    import os

    # check if the locale is supplied as a file
    if os.path.exists(locale):
        hyphenator = Hyphenator(locale)
    else:
        hyphenator = Hyphenator("/usr/share/myspell/hyph_" + locale + ".dic")

    scores = {
        'sent_count': 0,          # nr of sentences
        'word_count': 0,          # nr of words
        'letter_count': 0,        # nr of characters in words (no spaces)
        'syll_count': 0,          # nr of syllables
        'polysyllword_count': 0,  # nr of polysyllables (words with 3 or more syllables)
        'simpleword_count': 0,    # nr of simplewords (depends on provided list)
        'sentlen_average': 0,     # words per sentence
        'wordlen_average': 0,     # syllables per word
        'wordletter_average': 0,  # letters per word
        'wordsent_average': 0     # sentences per word
    }

    # Decode once at the boundary.  Calling .decode() on text that is
    # already decoded fails, so only byte strings are decoded.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # A set makes the per-word membership test O(1).
    simple_words = set(simplewordlist or ())
    word_pattern = re.compile(r'\w+', flags=re.UNICODE)

    sentences = sent_tokenize(text)
    scores['sent_count'] = len(sentences)

    for sentence in sentences:
        words = word_pattern.findall(sentence)
        scores['word_count'] += len(words)

        for word in words:
            # every hyphenation point separates two syllables
            syllables_count = hyphenator.inserted(word).count('-') + 1
            scores['letter_count'] += len(word)
            scores['syll_count'] += syllables_count

            if syllables_count > 2:
                scores['polysyllword_count'] += 1

            if word.lower() in simple_words:
                scores['simpleword_count'] += 1

    # float() forces true division even where "/" on ints would truncate.
    if scores['sent_count'] > 0:
        scores['sentlen_average'] = float(scores['word_count']) / scores['sent_count']

    if scores['word_count'] > 0:
        scores['wordlen_average'] = float(scores['syll_count']) / scores['word_count']
        scores['wordletter_average'] = float(scores['letter_count']) / scores['word_count']
        scores['wordsent_average'] = float(scores['sent_count']) / scores['word_count']

    return scores
Пример #2
0
def hyphenateText(text):
    """
    Add lyrics hyphenation to the selected text.

    The language is obtained from askLanguage().  When a language is
    returned, a Hyphenator is built from hyphdicts[lang] and _wordSub is
    called with a callback that replaces each match with its hyphenated
    form, using ' -- ' as the separator.  When no language is returned
    the function returns None.
    """
    lang = askLanguage()
    if not lang:
        return None

    from hyphenator import Hyphenator
    hyph = Hyphenator(hyphdicts[lang])

    def hyphenated(match):
        # insert the lyrics separator at every hyphenation point
        return hyph.inserted(match.group(), ' -- ')

    return _wordSub(hyphenated, text)
Пример #3
0
def getTextScores(text, locale='en_GB', simplewordlist=None, smoggy=False):
    """
    Calculates several text scores based on a piece of text.

    A custom locale can be provided for the hyphenator, which
    maps to a Myspell hyphenator dictionary.

    text may be a UTF-8 encoded byte string or already-decoded text.
    The simple word list should be provided in lower case; when it is
    omitted or empty no simple words are counted.

    When smoggy is true and the text has more than 30 sentences, only a
    sample of 30 sentences (10 from the beginning, 10 around the middle
    and 10 from the end) is scored, and sent_count reflects that sample.

    Returns a dict with the keys sent_count, word_count, letter_count,
    syll_count, polysyllword_count, simpleword_count, sentlen_average,
    wordlen_average, wordletter_average and wordsent_average.  The
    averages are floats; an average whose denominator is zero (for
    example for empty text) is left at 0 instead of raising
    ZeroDivisionError.
    """
    from nltk.tokenize import sent_tokenize
    from hyphenator import Hyphenator
    import re

    hyphenator = Hyphenator("/usr/share/myspell/hyph_" + locale + ".dic")
    scores = {
        'sent_count': 0,          # nr of sentences
        'word_count': 0,          # nr of words
        'letter_count': 0,        # nr of characters in words (no spaces)
        'syll_count': 0,          # nr of syllables
        'polysyllword_count': 0,  # nr of polysyllables (words with 3 or more syllables)
        'simpleword_count': 0,    # nr of simplewords (depends on provided list)
        'sentlen_average': 0,     # words per sentence
        'wordlen_average': 0,     # syllables per word
        'wordletter_average': 0,  # letters per word
        'wordsent_average': 0     # sentences per word
    }

    # Decode once at the boundary.  Calling .decode() on text that is
    # already decoded fails, so only byte strings are decoded.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # A set makes the per-word membership test O(1).
    simple_words = set(simplewordlist or ())
    word_pattern = re.compile(r'\w+', flags=re.UNICODE)

    sentences = sent_tokenize(text)
    sent_count = len(sentences)  # don't assign this to scores, as sentences may need to be recalculated
    if smoggy and sent_count > 30:  # see http://webpages.charter.net/ghal/SMOG_Readability_Formula_G._Harry_McLaughlin_%281969%29.pdf
        # get a sample of 10 sentences from the beginning, middle and the end of the text
        middle = sent_count // 2
        sentences = sentences[:10] + sentences[middle - 5:middle + 5] + sentences[-10:]
    scores['sent_count'] = len(sentences)

    for sentence in sentences:
        words = word_pattern.findall(sentence)
        scores['word_count'] += len(words)

        for word in words:
            # every hyphenation point separates two syllables
            syllables_count = hyphenator.inserted(word).count('-') + 1
            scores['letter_count'] += len(word)
            scores['syll_count'] += syllables_count

            if syllables_count > 2:
                scores['polysyllword_count'] += 1

            if word.lower() in simple_words:
                scores['simpleword_count'] += 1

    # float() forces true division even where "/" on ints would truncate.
    if scores['sent_count'] > 0:
        scores['sentlen_average'] = float(scores['word_count']) / scores['sent_count']

    if scores['word_count'] > 0:
        scores['wordlen_average'] = float(scores['syll_count']) / scores['word_count']
        scores['wordletter_average'] = float(scores['letter_count']) / scores['word_count']
        scores['wordsent_average'] = float(scores['sent_count']) / scores['word_count']

    return scores
Пример #4
0
def getTextScores(text, locale='en_GB', simplewordlist=None):
    """
    Calculates several text scores based on a piece of text.

    A custom locale can be provided for the hyphenator, which
    maps to a Myspell hyphenator dictionary.

    text may be a UTF-8 encoded byte string or already-decoded text.
    The simple word list should be provided in lower case; when it is
    omitted or empty no simple words are counted.

    Returns a dict with the keys sent_count, word_count, letter_count,
    syll_count, polysyllword_count, simpleword_count, sentlen_average,
    wordlen_average, wordletter_average and wordsent_average.  The
    averages are floats; an average whose denominator is zero (for
    example for empty text) is left at 0 instead of raising
    ZeroDivisionError.
    """
    from nltk.tokenize import sent_tokenize
    from hyphenator import Hyphenator
    import re

    hyphenator = Hyphenator("/usr/share/myspell/hyph_" + locale + ".dic")
    scores = {
        'sent_count': 0,  # nr of sentences
        'word_count': 0,  # nr of words
        'letter_count': 0,  # nr of characters in words (no spaces)
        'syll_count': 0,  # nr of syllables
        'polysyllword_count':
        0,  # nr of polysyllables (words with 3 or more syllables)
        'simpleword_count': 0,  # nr of simplewords (depends on provided list)
        'sentlen_average': 0,  # words per sentence
        'wordlen_average': 0,  # syllables per word
        'wordletter_average': 0,  # letters per word
        'wordsent_average': 0  # sentences per word
    }

    # Decode once at the boundary.  Calling .decode() on text that is
    # already decoded fails, so only byte strings are decoded.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # A set makes the per-word membership test O(1).
    simple_words = set(simplewordlist or ())
    word_pattern = re.compile(r'\w+', flags=re.UNICODE)

    sentences = sent_tokenize(text)
    scores['sent_count'] = len(sentences)

    for sentence in sentences:
        words = word_pattern.findall(sentence)
        scores['word_count'] += len(words)

        for word in words:
            # every hyphenation point separates two syllables
            syllables_count = hyphenator.inserted(word).count('-') + 1
            scores['letter_count'] += len(word)
            scores['syll_count'] += syllables_count

            if syllables_count > 2:
                scores['polysyllword_count'] += 1

            if word.lower() in simple_words:
                scores['simpleword_count'] += 1

    # float() forces true division even where "/" on ints would truncate.
    if scores['sent_count'] > 0:
        scores['sentlen_average'] = (
            float(scores['word_count']) / scores['sent_count'])

    if scores['word_count'] > 0:
        scores['wordlen_average'] = (
            float(scores['syll_count']) / scores['word_count'])
        scores['wordletter_average'] = (
            float(scores['letter_count']) / scores['word_count'])
        scores['wordsent_average'] = (
            float(scores['sent_count']) / scores['word_count'])

    return scores
Пример #5
0
def hyphenate(text, mainwindow):
    """
    Hyphenate lyrics text in a language chosen by the user.

    If hyphdicts is empty, a KMessageBox.sorry message is shown and None
    is returned.  Otherwise a dialog lists the sorted keys of hyphdicts,
    preselecting the "lastused" entry of the "hyphenation" config when
    it is among them (the first row otherwise).

    When the dialog is accepted, the chosen language is written back to
    the config and the result of substituting every ly.rx.lyric_word
    match in text with its hyphenated form (separator ' -- ') is
    returned.  When the dialog is not accepted, None is returned.
    """
    if not hyphdicts:
        KMessageBox.sorry(mainwindow, i18n(
            "Could not find any hyphenation dictionaries.\n\n"
            "Please install a package containing some and/or or configure the "
            "search path to find them in the Frescobaldi settings under "
            "\"Paths.\""))
        return

    conf = config("hyphenation")
    last_used = conf.readEntry("lastused", "")
    languages = sorted(hyphdicts.keys())
    # preselect the language used last time, or the first one
    start_row = languages.index(last_used) if last_used in languages else 0

    dialog = KDialog(mainwindow)
    buttons = KDialog.ButtonCode(KDialog.Ok | KDialog.Cancel | KDialog.Help)
    dialog.setButtons(buttons)
    dialog.setCaption(i18n("Hyphenate Lyrics Text"))
    dialog.setHelp("lyrics")

    layout = QVBoxLayout()
    dialog.mainWidget().setLayout(layout)
    layout.addWidget(QLabel(i18n("Please select a language:")))
    chooser = QListWidget()
    layout.addWidget(chooser)
    chooser.addItems(languages)
    chooser.setCurrentRow(start_row)
    chooser.setFocus()

    if not dialog.exec_():
        return

    chosen = languages[chooser.currentRow()]
    conf.writeEntry("lastused", chosen)
    conf.sync()

    # get hyphenator
    hyph = Hyphenator(hyphdicts[chosen])

    def hyphenated(match):
        return hyph.inserted(match.group(), ' -- ')

    return ly.rx.lyric_word.sub(hyphenated, text)
Пример #6
0
def get_hyphenator_for_language(language):
    """
    Return a Hyphenator loaded with the dictionary for a language.

    The language code is lower-cased and looked up in DICTIONARIES; a
    code that is not present falls back to 'en-us'.  The dictionary
    file is read from the 'dicts' directory next to this module.

    >>> get_hyphenator_for_language('ru-ru') #doctest: +ELLIPSIS
    <hyphenator.Hyphenator object at ...
    """
    key = language.lower()
    if key not in DICTIONARIES:
        # unknown language: use English instead
        key = 'en-us'

    module_dir = os.path.dirname(__file__)
    dict_file = 'dicts/%s.dic' % DICTIONARIES[key]
    return Hyphenator(os.path.join(module_dir, dict_file))
Пример #7
0
def process_dom(dom, lang):
    """
    Run insert_hyphens over selected elements of dom and return dom.

    A single Hyphenator(lang) is created and passed, together with each
    element whose local name is 'p', 'v', 'text-author' or 'div', to
    insert_hyphens.  Tags are handled one after another in that order.
    """
    hyph = Hyphenator(lang)
    tag_names = ('p', 'v', 'text-author', 'div')
    for name in tag_names:
        # match on local-name() so the element's namespace is ignored
        matches = dom.xpath("//*[local-name() = '%s']" % name)
        for element in matches:
            insert_hyphens(element, hyph)
    return dom
Пример #8
0
def getTextScores(text, locale='en_GB', simplewordlist=None):
    """
    Calculates several text scores based on a piece of text.

    A custom locale can be provided for the hyphenator, which
    maps to a Myspell hyphenator dictionary.  If the locale is the
    path of an existing file, the dictionary at that path is used
    instead of the one in the default Myspell location.

    text may be a UTF-8 encoded byte string or already-decoded text.
    The simple word list should be provided in lower case; when it is
    omitted or empty no simple words are counted.

    Returns a dict with the keys sent_count, word_count, letter_count,
    syll_count, polysyllword_count, simpleword_count, sentlen_average,
    wordlen_average, wordletter_average and wordsent_average.  The
    averages are floats; an average whose denominator is zero (for
    example for empty text) is left at 0.
    """
    from nltk.tokenize import sent_tokenize
    from hyphenator import Hyphenator
    import re
    import os

    #check if the locale is supplied as a file
    if os.path.exists(locale):
        hyphenator = Hyphenator(locale)
    else:
        hyphenator = Hyphenator("/usr/share/myspell/hyph_" + locale + ".dic")

    scores = {
        'sent_count': 0,  # nr of sentences
        'word_count': 0,  # nr of words
        'letter_count': 0,  # nr of characters in words (no spaces)
        'syll_count': 0,  # nr of syllables
        'polysyllword_count':
        0,  # nr of polysyllables (words with 3 or more syllables)
        'simpleword_count': 0,  # nr of simplewords (depends on provided list)
        'sentlen_average': 0,  # words per sentence
        'wordlen_average': 0,  # syllables per word
        'wordletter_average': 0,  # letters per word
        'wordsent_average': 0  # sentences per word
    }

    # Decode once at the boundary and work on decoded text from here on,
    # instead of encoding to UTF-8 and decoding every sentence again.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # A set makes the per-word membership test O(1).
    simple_words = set(simplewordlist or ())
    word_pattern = re.compile(r'\w+', flags=re.UNICODE)

    sentences = sent_tokenize(text)
    scores['sent_count'] = len(sentences)

    for sentence in sentences:
        words = word_pattern.findall(sentence)
        scores['word_count'] += len(words)

        for word in words:
            # every hyphenation point separates two syllables
            syllables_count = hyphenator.inserted(word).count('-') + 1
            scores['letter_count'] += len(word)
            scores['syll_count'] += syllables_count

            if syllables_count > 2:
                scores['polysyllword_count'] += 1

            if word.lower() in simple_words:
                scores['simpleword_count'] += 1

    # float() forces true division even where "/" on ints would truncate.
    if scores['sent_count'] > 0:
        scores['sentlen_average'] = (
            float(scores['word_count']) / scores['sent_count'])

    if scores['word_count'] > 0:
        scores['wordlen_average'] = (
            float(scores['syll_count']) / scores['word_count'])
        scores['wordletter_average'] = (
            float(scores['letter_count']) / scores['word_count'])
        scores['wordsent_average'] = (
            float(scores['sent_count']) / scores['word_count'])

    return scores
Пример #9
0
 def process_chardata(self, text, line=False, par=False):
     """
     Return text as hyphenated by a new Hyphenator, using U+00AD.

     The line and par arguments are accepted but not used here.
     """
     # insert softhyphens
     soft_hyphen = u'\u00AD'
     hyphenator = Hyphenator()
     return hyphenator.hyphenate(text, soft_hyphen)