def hyphenate(value, arg=None, autoescape=None):
    """Insert soft hyphens (U+00AD) at the syllable boundaries of long words.

    ``value`` is split on single spaces. Words that are purely alphabetic
    and longer than the minimum length are re-joined from their syllables
    with a soft hyphen; every other word passes through unchanged.

    ``arg`` is an optional ``"<language-code>[,<min-length>]"`` string such
    as ``"en-us,6"``. Without it ``settings.LANGUAGE_CODE`` is used. The
    minimum length defaults to 5.

    When ``autoescape`` is truthy the input goes through
    ``conditional_escape`` before the result is marked safe.

    Raises ``ValueError`` if the minimum length in ``arg`` is not an integer.
    """
    import locale

    esc = conditional_escape if autoescape else (lambda x: x)

    # The default must exist even when no argument is passed at all.
    minlen = 5
    if arg:
        args = arg.split(u',')
        code = args[0]
        if len(args) > 1:
            minlen = int(args[1])
    else:
        code = settings.LANGUAGE_CODE

    parts = code.split(u'-')
    if len(parts) > 1:
        lang = parts[0].lower() + u'_' + parts[1].upper()
    else:
        # A bare language code ("en") has no region part; let the locale
        # alias table supply one and drop any encoding suffix.
        lang = locale.normalize(parts[0]).split('.')[0]

    if not dictools.is_installed(lang):
        dictools.install(lang)
    h = hyphenator(lang)

    # Escape first: alphabetic words contain nothing escapable, so the
    # hyphenation below is unaffected.
    value = esc(value)

    soft_hyphen = u'\u00ad'
    new = []
    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # An empty syllable list would otherwise erase the word.
            new.append(soft_hyphen.join(h.syllables(word)) or word)
        else:
            new.append(word)
    return mark_safe(u' '.join(new))
def hyphenate(value, arg=None, autoescape=None):
    """Insert soft hyphens (U+00AD) at the syllable boundaries of long words.

    ``value`` is split on single spaces. Words that are purely alphabetic
    and longer than the minimum length are re-joined from their syllables
    with a soft hyphen; every other word passes through unchanged.

    ``arg`` is an optional ``"<language-code>[,<min-length>]"`` string.
    Without it the language reported by ``get_language()`` is used. The
    minimum length defaults to 6.

    Returns the hyphenated text marked safe, or ``value`` unchanged when no
    language code can be determined.

    Raises ``ValueError`` if the minimum length in ``arg`` is not an integer.
    """
    # NOTE(review): ``autoescape`` is not used and the text is marked safe
    # without being escaped -- confirm callers only pass trusted content.

    # Default minimal length
    minlen = 6
    if arg:
        args = arg.split(u',')
        code = args[0]
        # Override minimal length, if specified
        if len(args) > 1:
            minlen = int(args[1])
    else:
        # No language specified, use Django's current
        code = get_language()

    if not code:
        # No language available; leave the text alone rather than fail.
        return value

    # locale.normalize() only knows underscore-separated names, so turn a
    # hyphenated code ("en-us") into that form first, then drop a potential
    # encoding suffix from the normalized locale.
    lang = locale.normalize(code.replace('-', '_')).split('.')[0]

    # Make sure the proper language is installed
    if not dictools.is_installed(lang):
        dictools.install(lang)
    h = Hyphenator(lang)

    soft_hyphen = u'\u00ad'
    new = []
    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # An empty syllable list would otherwise erase the word.
            new.append(soft_hyphen.join(h.syllables(word)) or word)
        else:
            new.append(word)
    return mark_safe(u' '.join(new))
def syllablize(poem):
    """Break every word of every line in ``poem`` into syllables.

    ``poem`` is an iterable of text lines. The result is a list with one
    entry per line that contains at least one word; each entry is a list of
    strings, one per word, with the word's syllables separated by single
    spaces. A word for which the hyphenator returns no syllables is kept
    as it is.

    The hyphenation dictionary for the module-level ``language`` is
    installed first if it is missing.
    """
    if not is_installed(language):
        install(language)
    splitter = Hyphenator(language)

    result = []
    for line in poem:
        rendered = []
        for word in line.split():
            pieces = splitter.syllables(word)
            text = " ".join(pieces) if pieces else word
            rendered.append(text.strip())
        # Lines without any word are left out of the result.
        if rendered:
            result.append(rendered)
    return result
def by_syllable(input_gen, lang, install_lang_p):
    """Lazily yield the syllables of each word produced by ``input_gen``.

    ``lang`` names the hyphenation dictionary. When ``install_lang_p`` is
    truthy and that dictionary is not installed, it is installed first.
    Each word's syllable list is logged at debug level before its
    syllables are yielded one by one.
    """
    needs_install = install_lang_p and not dictools.is_installed(lang)
    if needs_install:
        dictools.install(lang)

    splitter = Hyphenator(lang)
    for token in input_gen:
        parts = splitter.syllables(token)
        logging.debug("syllables: {}".format(parts))
        for part in parts:
            yield part
def syllable_pos_setup(self):
    """Prepare the syllable resources and the part-of-speech tagger.

    Installs every dictionary in a fixed list of English locales that is
    not installed yet, loads the CMU dictionary into ``self.cmu_dict`` and
    selects a tagger for ``self.pos_tag``. It then fills the lookup tables
    ``self.tag_dict``, ``self.pos_double``, ``self.pos_lead`` and
    ``self.pos_restrict_lead``. Returns ``None``.
    """
    english_locales = (
        'en_CA', 'en_PH', 'en_NA', 'en_NZ', 'en_JM', 'en_BS',
        'en_US', 'en_IE', 'en_MW', 'en_IN', 'en_BZ', 'en_TT',
        'en_ZA', 'en_AU', 'en_GH', 'en_ZW', 'en_GB',
    )
    for locale_code in english_locales:
        if dictools.is_installed(locale_code):
            continue
        dictools.install(locale_code)

    self.cmu_dict = cmudict.dict()

    # Probe the default tagger once; if that raises a URLError, load the
    # perceptron model from the path that ``find`` resolves instead.
    try:
        nltk.pos_tag(['test'])
    except urllib2.URLError:
        pickle_name = "averaged_perceptron_tagger.pickle"
        model_location = 'file:' + str(
            find('taggers/averaged_perceptron_tagger/' + pickle_name))
        fallback_tagger = PerceptronTagger(load=False)
        fallback_tagger.load(model_location)
        self.pos_tag = fallback_tagger.tag
    else:
        self.pos_tag = nltk.pos_tag

    # Two-letter tag prefixes mapped to coarse word classes. An earlier,
    # finer-grained mapping kept Pronoun, Particle, Determiner, To,
    # Modal Aux, Predeterminer, Wh-determiner and Wh-pronoun separate.
    self.tag_dict = {
        'NN': 'Noun',
        'FW': 'Noun',
        'JJ': 'Adjective',
        'VB': 'Verb',
        'IN': 'Preposition',
        'CC': 'Conjunction',
        'RP': 'Connector',
        'TO': 'Connector',
        'MD': 'Connector',
        'RB': 'Adverb',
        'WR': 'Wh-adverb',
        'DT': 'DetPro',
        'WD': 'DetPro',
        'PD': 'DetPro',
        'PR': 'DetPro',
        'WP': 'DetPro',
        'CD': 'Cardinal',
        'EX': 'Existential there',
    }

    # Word classes allowed twice in a row (formerly ['Noun', 'Adjective']).
    self.pos_double = []

    # Word classes that must be followed by one of the listed classes,
    # e.g. an Adverb has to come in front of a Verb.
    self.pos_lead = {
        'Adverb': ['Verb'],
        'Pronoun': ['Noun'],
        'Adjective': ['Noun'],
        'Preposition': ['Noun', 'Pronoun'],
    }

    # Word classes that must not be followed by the given class,
    # e.g. a Preposition cannot come before a Verb.
    self.pos_restrict_lead = {'Preposition': 'Verb'}
def hyphenate(value, arg=None, autoescape=None):
    """Insert soft hyphens (U+00AD) at the syllable boundaries of long words.

    ``value`` is split on single spaces. Words that are purely alphabetic
    and longer than the minimum length are re-joined from their syllables
    with a soft hyphen; every other word passes through unchanged.

    ``arg`` is an optional ``"<language-code>[,<min-length>]"`` string.
    Without it ``settings.LANGUAGE_CODE`` is used. The minimum length
    defaults to 7.

    When ``autoescape`` is truthy the input goes through
    ``conditional_escape`` before the result is marked safe.

    Raises ``ValueError`` if the minimum length in ``arg`` is not an integer.
    """
    import locale

    esc = conditional_escape if autoescape else (lambda x: x)

    minlen = 7
    if arg:
        args = arg.split(u',')
        code = args[0]
        if len(args) > 1:
            minlen = int(args[1])
    else:
        code = settings.LANGUAGE_CODE

    # The language code may arrive as 'xx-YY' or as a bare 'en'. PyHyphen
    # needs a full locale, so a bare code has to be expanded.
    #
    # TODO: This should probably be a lookup against a dict in settings?
    default_regions = {'en': u'US', 'bg': u'BG'}
    parts = code.split(u'-')
    if len(parts) > 1:
        lang = parts[0].lower() + u'_' + parts[1].upper()
    elif parts[0] in default_regions:
        lang = parts[0].lower() + u'_' + default_regions[parts[0]]
    else:
        # Any other bare code: let the locale alias table supply a region
        # and drop the encoding suffix.
        lang = locale.normalize(parts[0]).split('.')[0]

    if not dictools.is_installed(lang):
        dictools.install(lang)
    h = Hyphenator(lang)

    # Escape first: alphabetic words contain nothing escapable, so the
    # hyphenation below is unaffected.
    value = esc(value)

    soft_hyphen = u'\u00ad'
    new = []
    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # An empty syllable list would otherwise erase the word.
            new.append(soft_hyphen.join(h.syllables(word)) or word)
        else:
            new.append(word)
    return mark_safe(u' '.join(new))
def _set_lang_dict(self):
    """Install, if needed, and load the dictionary for ``self.lang_code``.

    Nothing happens unless ``self.dict_download`` is truthy. Otherwise the
    dictionary is installed when missing and ``self.lang_dict`` is set to a
    ``Hyphenator`` for the language. Failures are tolerated: the attribute
    is simply left unset. With ``self.verbose`` the outcome is printed.
    """
    if self.dict_download:
        try:
            if not is_installed(self.lang_code):
                if self.verbose:
                    print(Msg.DICT_INSTALL(self.lang_code))
                install(self.lang_code)
            self.lang_dict = Hyphenator(self.lang_code)
        except Exception:
            # Best effort: a failed install or load must not abort the
            # caller. KeyboardInterrupt and SystemExit still propagate.
            pass
        if self.verbose:
            if is_installed(self.lang_code):
                print(Msg.DICT_INSTALLED(self.lang_code))
            else:
                print(Msg.DICT_INSTALL_FAILED(self.lang_code))
def __init__(self, language="EN", minWordLength=4, quality=8,
             hyphenDir=None, **options):
    """Set up a hyphenator backed by a pyhyphen dictionary.

    ``language``, ``minWordLength`` and ``options`` are forwarded to
    ``ExplicitHyphenator.__init__``. ``hyphenDir`` is the dictionary
    directory; it defaults to ``../dict`` relative to this module. The
    dictionary for ``language`` is installed there if it is missing.
    ``quality`` is stored on the instance as given.
    """
    ExplicitHyphenator.__init__(self, language=language,
                                minWordLength=minWordLength, **options)
    if hyphenDir is None:
        hyphenDir = os.path.join(os.path.split(__file__)[0], "..", "dict")
    if not dictools.is_installed(language, directory=hyphenDir):
        dictools.install(language, directory=hyphenDir)
        # A single parenthesised argument prints the same text on
        # Python 2 and Python 3.
        print("installed dictionary for %s into %s" % (language, hyphenDir))
    self.hnj = pyhyphen.hyphenator(language, directory=hyphenDir)
    self.quality = quality
def __init__(self, language="EN", minWordLength=4, quality=8,
             hyphenDir=None, **options):
    """Set up a hyphenator backed by a pyhyphen dictionary.

    ``language``, ``minWordLength`` and ``options`` are forwarded to
    ``ExplicitHyphenator.__init__``. ``hyphenDir`` is the dictionary
    directory; it defaults to ``../dict`` relative to this module. The
    dictionary for ``language`` is installed there if it is missing.
    ``quality`` is stored on the instance as given.
    """
    ExplicitHyphenator.__init__(self, language=language,
                                minWordLength=minWordLength, **options)
    if hyphenDir is None:
        hyphenDir = os.path.join(os.path.split(__file__)[0], "..", "dict")
    if not dictools.is_installed(language, directory=hyphenDir):
        dictools.install(language, directory=hyphenDir)
        # A single parenthesised argument prints the same text on
        # Python 2 and Python 3.
        print("installed dictionary for %s into %s" % (language, hyphenDir))
    self.hnj = pyhyphen.hyphenator(language, directory=hyphenDir)
    self.quality = quality
import os
import json
import re
from collections import OrderedDict

from bs4 import BeautifulSoup
from hyphen import Hyphenator, dictools
from flask import current_app

# The Russian dictionary is installed at import time if it is missing,
# so the shared hyphenator below can always be built.
if not dictools.is_installed("ru_RU"):
    dictools.install("ru_RU")

RU_HYPHENATOR = Hyphenator("ru_RU")
SOFT_HYPHEN = u"\u00AD"
# Raw string: "\w" in a plain literal is an invalid escape sequence.
# NOTE(review): despite its name this pattern matches runs of word
# characters, not whitespace -- confirm against its users.
STRIP_WHITESPACE = re.compile(r"\w+", re.MULTILINE | re.UNICODE)
# Tag names collected for the hyphenator to leave alone (going by the
# constant's name; its users are outside this excerpt).
HYPHENATOR_BLACKLIST_TAGS = ("code", "tt", "pre", "head", "title", "script",
                             "style", "meta", "object", "embed", "samp",
                             "var", "math", "select", "option", "input",
                             "textarea", "span", "iframe")


def no_shy(text):
    """Return ``text`` with every soft hyphen (U+00AD) removed."""
    return text.replace(SOFT_HYPHEN, "")


def get_data(module):
    """Load ``data/<module>/posts.json`` below the Flask application root.

    Returns an ``OrderedDict`` that maps the first element of each JSON
    entry to the whole entry, in file order.
    """
    index_path = os.path.join(current_app.root_path, "data", module,
                              "posts.json")
    with open(index_path, "r", encoding="utf8") as index_file:
        return OrderedDict((p[0], p) for p in json.load(index_file))
# Write the new config.py codecs.open(mod_path, 'w', 'utf8').write(new_content) py_compile.compile(mod_path) print("Done.") # Delete any existing dict registry file reg_file = pkg_path + '/hyphen_dict_info.pickle' if os.path.exists(reg_file): os.remove(reg_file) # Install dictionaries if '--no_dictionaries' not in sys.argv: from hyphen.dictools import install print('Installing dictionaries... en_US ...') install('en_US') # Install dict for local language if needed try: locale.setlocale(locale.LC_ALL, '') local_lang = locale.getlocale()[0] # Install local dict only if locale has been read (= is not None) # and local_lang is not en_US. if local_lang and local_lang != 'en_US': print(local_lang + ' ') install(local_lang) print('Done.') except Exception: warn('Could not install dictionary for local language.') except ImportError:
# Write the new config.py. The handle is closed, and so flushed, before
# the file is byte-compiled.
with codecs.open(mod_path, 'w', 'utf8') as config_file:
    config_file.write(new_content)
py_compile.compile(mod_path)
print("Done.")

# Delete any existing dict registry file
reg_file = pkg_path + '/hyphen_dict_info.pickle'
if os.path.exists(reg_file):
    os.remove(reg_file)

# Install dictionaries unless the caller opted out on the command line
if '--no_dictionaries' not in sys.argv:
    from hyphen.dictools import install
    print('Installing dictionaries... en_US ...')
    install('en_US')

    # Install dict for local language if needed
    try:
        locale.setlocale(locale.LC_ALL, '')
        local_lang = locale.getlocale()[0]
        # Install local dict only if locale has been read (= is not None)
        # and local_lang is not en_US.
        if local_lang and local_lang != 'en_US':
            print(local_lang + ' ')
            install(local_lang)
        print('Done.')
    except Exception:
        # Best effort: the en_US dictionary is already in place.
        warn('Could not install dictionary for local language.')
# Remaining command-line options; ``parser`` is defined outside this excerpt.
parser.add_argument('--output_h5', default='data/tiny-shakespeare.h5')
parser.add_argument('--output_json', default='data/tiny-shakespeare.json')
parser.add_argument('--val_frac', type=float, default=0.1)
parser.add_argument('--test_frac', type=float, default=0.1)
parser.add_argument('--quiet', action='store_true')
# 'none' is the sentinel meaning "not set" for the two syllabic options.
parser.add_argument('--syllabic', default='none')
parser.add_argument('--install_syllabic_dict', default='none')
parser.add_argument('--encoding', default='utf-8')
# NOTE(review): arguments are parsed at module level, outside the
# __main__ guard below.
args = parser.parse_args()

if __name__ == '__main__':
    # 'bytes' means "no text encoding": codecs.open() then receives None.
    if args.encoding == 'bytes':
        args.encoding = None

    # Dictionary-install mode: install the requested dictionary and exit
    # without processing any input.
    if args.install_syllabic_dict != 'none':
        from hyphen import dictools
        dictools.install(args.install_syllabic_dict)
        sys.exit(0)

    # First pass over the file to see how big it is and to build the vocab
    if args.syllabic == 'none':
        syllabic = False
        token_to_idx = {}
        total_size = 0
        with codecs.open(args.input_txt, 'r', args.encoding) as f:
            for line in f:
                total_size += len(line)
                for char in line:
                    # Ids are handed out in order of first appearance,
                    # starting at 1.
                    if char not in token_to_idx:
                        token_to_idx[char] = len(token_to_idx) + 1
    else:
        # NOTE(review): the syllabic branch appears to continue beyond
        # this excerpt -- only the flag assignment is visible here.
        syllabic = True