예제 #1
0
def hyphenate(value, arg=None, autoescape=None):
    """Insert soft hyphens (U+00AD) between the syllables of long words.

    ``value`` is split on single spaces; every purely alphabetic word that
    is longer than the minimum length is replaced by its syllables joined
    with a soft hyphen.  All other words are passed through unchanged.

    ``arg`` is an optional ``"<language-code>[,<min-length>]"`` string such
    as ``"de-DE,8"``.  Without ``arg`` (or with an empty language field)
    ``settings.LANGUAGE_CODE`` is used; the default minimum length is 5.

    When ``autoescape`` is true the input is passed through
    ``conditional_escape`` before being processed, because the result is
    returned via ``mark_safe``.

    Raises ``ValueError`` if the minimum-length field is not an integer.
    """
    # Bind the default up front: it used to be assigned only inside the
    # ``if arg`` branch, so calling the filter without an argument raised
    # UnboundLocalError in the loop below.
    minlen = 5

    if arg:
        args = arg.split(u',')
        # An empty language field (e.g. ",8") falls back to the project default.
        code = args[0] or settings.LANGUAGE_CODE
        if len(args) > 1:
            minlen = int(args[1])
    else:
        code = settings.LANGUAGE_CODE

    # Convert "xx-yy" into the "xx_YY" form used as the dictionary name.
    parts = code.split(u'-')
    language = parts[0].lower()
    if len(parts) > 1:
        region = parts[1].upper()
    elif language == u'en':
        region = u'US'
    else:
        # Bare language code ("de", "fr", ...): guess the region from the
        # language instead of failing with IndexError.
        region = language.upper()
    lang = language + u'_' + region

    if not dictools.is_installed(lang):
        dictools.install(lang)

    # The result is marked safe below, so untrusted input has to be escaped
    # first.  Escaping cannot alter the words that get hyphenated: those are
    # purely alphabetic and contain nothing to escape.
    if autoescape:
        value = conditional_escape(value)

    h = hyphenator(lang)
    new = []
    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # u'\u00ad' is the soft hyphen.
            new.append(u'\u00ad'.join(h.syllables(word)))
        else:
            new.append(word)

    return mark_safe(u' '.join(new))
def hyphenate(value, arg=None, autoescape=None):
    """Return ``value`` with soft hyphens inserted into its long words.

    ``arg`` may carry ``"<language-code>[,<min-length>]"``.  When it is
    missing, the language comes from ``get_language()``; the minimum length
    defaults to 6.  Words are taken from a split on single spaces, and only
    purely alphabetic words longer than the minimum length are hyphenated.
    The joined result is returned through ``mark_safe``.

    ``autoescape`` is accepted but not used by this implementation.
    """
    fields = arg.split(u',') if arg else []

    # Language: first field of ``arg`` if given, otherwise the active one.
    code = fields[0] if fields else get_language()
    # Minimal length: second field of ``arg`` if given, otherwise 6.
    minlen = int(fields[1]) if len(fields) > 1 else 6

    # Normalize the locale code and drop a potential encoding suffix.
    lang = locale.normalize(code).partition('.')[0]

    # Install the dictionary for this language on first use.
    if not dictools.is_installed(lang):
        dictools.install(lang)

    h = Hyphenator(lang)

    def soften(word):
        """Hyphenate one word if it qualifies, else return it unchanged."""
        if len(word) > minlen and word.isalpha():
            # u'\u00ad' is the soft hyphen.
            return u'\u00ad'.join(h.syllables(word))
        return word

    return mark_safe(u' '.join([soften(word) for word in value.split(u' ')]))
예제 #3
0
def syllablize(poem):
    """Break the words of each line in ``poem`` into syllables.

    ``poem`` is iterated line by line and each line is split on whitespace.
    Every word becomes one string in which its syllables are separated by
    single spaces; a word for which the hyphenator returns no syllables is
    kept as is.  Returns a list with one list of such strings per line;
    lines without any words are left out.

    Uses the module-level ``language`` and installs its dictionary first if
    it is not installed yet.
    """
    if not is_installed(language):
        install(language)
    hyph = Hyphenator(language)

    def spell_out(word):
        """Return ``word`` with its syllables separated by spaces."""
        syls = hyph.syllables(word)
        spaced = word if len(syls) == 0 else " ".join(syls)
        return spaced.strip()

    result = []
    for line in poem:
        pieces = [spell_out(word) for word in line.split()]
        if pieces:
            result.append(pieces)
    return result
예제 #4
0
def by_syllable(input_gen, lang, install_lang_p):
    """Yield, one at a time, the syllables of every word in ``input_gen``.

    ``lang`` names the hyphenation dictionary to use.  If ``install_lang_p``
    is true and that dictionary is not installed, it is installed first.
    The syllable list of each word is also written to the debug log.
    """
    if install_lang_p:
        if not dictools.is_installed(lang):
            dictools.install(lang)

    splitter = Hyphenator(lang)

    for token in input_gen:
        parts = splitter.syllables(token)
        logging.debug("syllables: {}".format(parts))
        for part in parts:
            yield part
    def syllable_pos_setup(self):
        """Prepare hyphenation dictionaries, the CMU dict and POS tagging.

        Sets ``self.cmu_dict``, ``self.pos_tag``, ``self.tag_dict``,
        ``self.pos_double``, ``self.pos_lead`` and ``self.pos_restrict_lead``.
        """
        english_locales = (
            'en_CA', 'en_PH', 'en_NA', 'en_NZ', 'en_JM', 'en_BS', 'en_US',
            'en_IE', 'en_MW', 'en_IN', 'en_BZ', 'en_TT', 'en_ZA', 'en_AU',
            'en_GH', 'en_ZW', 'en_GB',
        )
        # Install every English dictionary that is not present yet.
        for locale_code in english_locales:
            if not dictools.is_installed(locale_code):
                dictools.install(locale_code)

        self.cmu_dict = cmudict.dict()

        # POS tagger: probe nltk.pos_tag once; if the probe raises
        # URLError, load the averaged perceptron model from the path that
        # find() resolves and use that tagger instead.
        try:
            nltk.pos_tag(['test'])
        except urllib2.URLError:
            pickle_name = "averaged_perceptron_tagger.pickle"
            model_url = 'file:' + str(
                find('taggers/averaged_perceptron_tagger/' + pickle_name))
            tagger = PerceptronTagger(load=False)
            tagger.load(model_url)
            self.pos_tag = tagger.tag
        else:
            self.pos_tag = nltk.pos_tag

        # Two-letter tag prefix -> coarse part-of-speech label.
        # NOTE(review): no entry produces 'Pronoun' ('PR' and 'WP' map to
        # 'DetPro'), yet pos_lead below refers to 'Pronoun' - confirm that
        # this is intended.
        self.tag_dict = {
            'NN': 'Noun',
            'FW': 'Noun',
            'JJ': 'Adjective',
            'VB': 'Verb',
            'IN': 'Preposition',
            'CC': 'Conjunction',
            'RP': 'Connector',
            'TO': 'Connector',
            'MD': 'Connector',
            'RB': 'Adverb',
            'WR': 'Wh-adverb',
            'DT': 'DetPro',
            'WD': 'DetPro',
            'PD': 'DetPro',
            'PR': 'DetPro',
            'WP': 'DetPro',
            'CD': 'Cardinal',
            'EX': 'Existential there',
        }

        # POS which are allowed to happen twice in a row
        # (currently none; 'Noun' and 'Adjective' were candidates).
        self.pos_double = []

        # POS which can only occur sequentially,
        # i.e. an Adverb must occur in front of a Verb.
        self.pos_lead = {
            'Adverb': ['Verb'],
            'Pronoun': ['Noun'],
            'Adjective': ['Noun'],
            'Preposition': ['Noun', 'Pronoun'],
        }

        # POS which cannot occur sequentially,
        # i.e. a Preposition cannot come before a Verb.
        self.pos_restrict_lead = {'Preposition': 'Verb'}
예제 #6
0
def hyphenate(value, arg=None, autoescape=None):
    """Insert soft hyphens (U+00AD) between the syllables of long words.

    ``value`` is split on single spaces; every purely alphabetic word that
    is longer than the minimum length is replaced by its syllables joined
    with a soft hyphen.  All other words are passed through unchanged.

    ``arg`` is an optional ``"<language-code>[,<min-length>]"`` string.
    Without ``arg`` (or with an empty language field)
    ``settings.LANGUAGE_CODE`` is used; the default minimum length is 7.

    When ``autoescape`` is true the input is passed through
    ``conditional_escape`` before being processed, because the result is
    returned via ``mark_safe``.

    Raises ``ValueError`` if the minimum-length field is not an integer.
    """
    minlen = 7

    if arg:
        args = arg.split(u',')
        # An empty language field (e.g. ",8") falls back to the project default.
        code = args[0] or settings.LANGUAGE_CODE
        if len(args) > 1:
            minlen = int(args[1])
    else:
        code = settings.LANGUAGE_CODE

    # PyHyphen wants a full locale ("xx_YY"), but the code may arrive as a
    # bare language such as "en", so it has to be expanded.
    # TODO: This should probably be a lookup against a dict in settings?
    default_regions = {u'en': u'US', u'bg': u'BG'}

    parts = code.split(u'-')
    language = parts[0].lower()
    if len(parts) > 1:
        region = parts[1].upper()
    else:
        # Languages without an explicit default get a region guessed from
        # the language itself ("de" -> "DE").  Previously anything but
        # "en"/"bg" raised IndexError here.
        region = default_regions.get(language, language.upper())
    lang = language + u'_' + region

    if not dictools.is_installed(lang):
        dictools.install(lang)

    # The result is marked safe below, so untrusted input has to be escaped
    # first.  Escaping cannot alter the words that get hyphenated: those are
    # purely alphabetic and contain nothing to escape.
    if autoescape:
        value = conditional_escape(value)

    h = Hyphenator(lang)

    new = []
    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # u'\u00ad' is the soft hyphen.
            new.append(u'\u00ad'.join(h.syllables(word)))
        else:
            new.append(word)

    return mark_safe(u' '.join(new))
 def _set_lang_dict(self):
     """Install (if needed) and load the dictionary for ``self.lang_code``.

     Does nothing unless ``self.dict_download`` is true.  On success
     ``self.lang_dict`` is set to a ``Hyphenator`` for ``self.lang_code``.
     Failures are tolerated: ``self.lang_dict`` is then left untouched and,
     when ``self.verbose`` is set, the outcome is printed.
     """
     if not self.dict_download:
         return

     try:
         if not is_installed(self.lang_code):
             if self.verbose:
                 print(Msg.DICT_INSTALL(self.lang_code))
             install(self.lang_code)
         self.lang_dict = Hyphenator(self.lang_code)
     except Exception:
         # Deliberately best effort: a failed install or load must not abort
         # the caller.  ``Exception`` (instead of a bare ``except``) keeps
         # KeyboardInterrupt and SystemExit from being swallowed.
         pass

     if self.verbose:
         if is_installed(self.lang_code):
             print(Msg.DICT_INSTALLED(self.lang_code))
         else:
             print(Msg.DICT_INSTALL_FAILED(self.lang_code))
예제 #8
0
 def __init__(self,
              language="EN",
              minWordLength=4,
              quality=8,
              hyphenDir=None,
              **options):
     """Set up a hyphenator backed by a PyHyphen dictionary.

     ``language``, ``minWordLength`` and ``**options`` are forwarded to
     ``ExplicitHyphenator.__init__``.  ``hyphenDir`` is the dictionary
     directory; it defaults to ``../dict`` relative to this module.  The
     dictionary for ``language`` is installed into that directory if it is
     not there yet.  ``quality`` is stored on the instance as given.
     """
     ExplicitHyphenator.__init__(self,
                                 language=language,
                                 minWordLength=minWordLength,
                                 **options)
     if hyphenDir is None:
         hyphenDir = os.path.join(os.path.split(__file__)[0], "..", "dict")
     if not dictools.is_installed(language, directory=hyphenDir):
         dictools.install(language, directory=hyphenDir)
         # Parenthesized single-argument form prints the same text on
         # Python 2 and Python 3.
         print("installed dictionary for %s into %s" % (language, hyphenDir))
     self.hnj = pyhyphen.hyphenator(language, directory=hyphenDir)
     self.quality = quality
예제 #9
0
 def __init__(self,
              language="EN",
              minWordLength=4,
              quality=8,
              hyphenDir=None,
              **options):
     """Create a hyphenator that uses a PyHyphen dictionary.

     ``language``, ``minWordLength`` and ``**options`` are passed on to
     ``ExplicitHyphenator.__init__``.  ``hyphenDir`` names the directory
     holding the dictionaries and defaults to ``../dict`` next to this
     module; a missing dictionary for ``language`` is installed there.
     ``quality`` is kept on the instance unchanged.
     """
     ExplicitHyphenator.__init__(self,
                                 language=language,
                                 minWordLength=minWordLength,
                                 **options)
     if hyphenDir is None:
         hyphenDir = os.path.join(os.path.split(__file__)[0], "..", "dict")
     if not dictools.is_installed(language, directory=hyphenDir):
         dictools.install(language, directory=hyphenDir)
         # Function-call form of print works on both Python 2 and Python 3
         # when given a single pre-formatted string.
         print("installed dictionary for %s into %s" % (language, hyphenDir))
     self.hnj = pyhyphen.hyphenator(language, directory=hyphenDir)
     self.quality = quality
예제 #10
0
import os
import json
import re

from collections import OrderedDict

from bs4 import BeautifulSoup
from hyphen import Hyphenator, dictools

from flask import current_app

# Runs at import time: make sure the Russian dictionary is installed before
# the module-wide hyphenator is built from it.
if not dictools.is_installed("ru_RU"):
    dictools.install("ru_RU")

RU_HYPHENATOR = Hyphenator("ru_RU")
SOFT_HYPHEN = u"\u00AD"
# Raw string: "\w" in a plain literal is an invalid escape sequence that newer
# Python versions warn about.  The compiled pattern is unchanged.
# NOTE(review): despite its name, this pattern matches runs of word
# characters, not whitespace - confirm against its users.
STRIP_WHITESPACE = re.compile(r"\w+", re.MULTILINE | re.UNICODE)
# Tag names listed as excluded from hyphenation (presumably their content is
# skipped by the code that uses this tuple - verify against its users).
HYPHENATOR_BLACKLIST_TAGS = ("code", "tt", "pre", "head", "title", "script", "style", "meta", "object", "embed",
                             "samp", "var", "math", "select", "option", "input", "textarea", "span", "iframe")


def no_shy(text):
    """Return ``text`` with every ``SOFT_HYPHEN`` occurrence removed."""
    without_soft_hyphens = text.replace(SOFT_HYPHEN, "")
    return without_soft_hyphens


def get_data(module):
    """Load the post index of ``module`` as an ordered mapping.

    Reads ``<app root>/data/<module>/posts.json`` (UTF-8) and returns an
    ``OrderedDict`` that maps the first element of each entry to the whole
    entry, in file order.
    """
    index_path = os.path.join(current_app.root_path, "data", module, "posts.json")

    with open(index_path, "r", encoding="utf8") as index_file:
        posts = json.load(index_file)

    return OrderedDict((post[0], post) for post in posts)
예제 #11
0
파일: setup.py 프로젝트: mbevila/pyhyphen
        # Write the new config.py. The file is closed (and therefore flushed)
        # before py_compile reads it back; the handle was previously left
        # for the garbage collector to close.
        with codecs.open(mod_path, 'w', 'utf8') as config_file:
            config_file.write(new_content)
        py_compile.compile(mod_path)
        print("Done.")

        # Delete any existing dict registry file
        reg_file = pkg_path + '/hyphen_dict_info.pickle'
        if os.path.exists(reg_file):
            os.remove(reg_file)

        # Install dictionaries, unless '--no_dictionaries' was given on the
        # command line
        if '--no_dictionaries' not in sys.argv:
            from hyphen.dictools import install
            print('Installing dictionaries... en_US ...')
            install('en_US')

            # Install dict for local language if needed
            try:
                locale.setlocale(locale.LC_ALL, '')
                local_lang = locale.getlocale()[0]
                # Install local dict only if locale has been read (= is not None)
                # and local_lang is not en_US.
                if local_lang and local_lang != 'en_US':
                    print(local_lang + ' ')
                    install(local_lang)
                    print('Done.')
            except Exception:
                # Best effort: a missing local dictionary only produces a
                # warning and does not abort the setup.
                warn('Could not install dictionary for local language.')

    except ImportError:
예제 #12
0
        
        # Write the new config.py. Using a context manager closes (and so
        # flushes) the file before py_compile reads it; the handle used to
        # be left open until garbage collection.
        with codecs.open(mod_path, 'w', 'utf8') as config_file:
            config_file.write(new_content)
        py_compile.compile(mod_path)
        print("Done.")

        # Delete any existing dict registry file
        reg_file = pkg_path + '/hyphen_dict_info.pickle'
        if os.path.exists(reg_file):
            os.remove(reg_file)

        # Install dictionaries, unless '--no_dictionaries' was given on the
        # command line
        if '--no_dictionaries' not in sys.argv:
            from hyphen.dictools import install
            print('Installing dictionaries... en_US ...')
            install('en_US')

            # Install dict for local language if needed
            try:
                locale.setlocale(locale.LC_ALL, '')
                local_lang = locale.getlocale()[0]
                # Install local dict only if locale has been read (= is not None)
                # and local_lang is not en_US.
                if local_lang and local_lang != 'en_US':
                    print(local_lang + ' ')
                    install(local_lang)
                    print('Done.')
            except Exception:
                # Best effort: failing to install the local dictionary is
                # reported as a warning only.
                warn('Could not install dictionary for local language.')

            
예제 #13
0
# Output locations (presumably the HDF5 data file and its JSON companion
# written by this script - the writing code is not visible here).
parser.add_argument('--output_h5', default='data/tiny-shakespeare.h5')
parser.add_argument('--output_json', default='data/tiny-shakespeare.json')
# Presumably the fractions of the input held out for validation and test
# splits - TODO confirm against the splitting code.
parser.add_argument('--val_frac', type=float, default=0.1)
parser.add_argument('--test_frac', type=float, default=0.1)
# Boolean flag, off by default.
parser.add_argument('--quiet', action='store_true')
# 'none' selects the character-level vocabulary; any other value switches the
# script into syllabic mode.
parser.add_argument('--syllabic', default='none')
# Any value other than 'none' is passed to hyphen.dictools.install(), after
# which the script exits without processing input.
parser.add_argument('--install_syllabic_dict', default='none')
# Codec used to read the input text; the special value 'bytes' is turned into
# None before the file is opened.
parser.add_argument('--encoding', default='utf-8')
# NOTE: arguments are parsed at import time, outside the __main__ guard below.
args = parser.parse_args()

if __name__ == '__main__':
    if args.encoding == 'bytes': args.encoding = None

    if args.install_syllabic_dict != 'none':
        from hyphen import dictools
        dictools.install(args.install_syllabic_dict)
        sys.exit(0)

    # First go the file once to see how big it is and to build the vocab
    if args.syllabic == 'none':
        syllabic = False
        token_to_idx = {}
        total_size = 0
        with codecs.open(args.input_txt, 'r', args.encoding) as f:
            for line in f:
                total_size += len(line)
                for char in line:
                    if char not in token_to_idx:
                        token_to_idx[char] = len(token_to_idx) + 1
    else:
        syllabic = True