def __init__(self, language="EN", minWordLength=4, quality=8, hyphenDir=None, **options):
    """
    Hyphenator backed by the external "pyhyphen" wrapper around libhnj.

    language:      dictionary language code, e.g. "EN" (default).
    minWordLength: words shorter than this are never hyphenated
                   (forwarded to ExplicitHyphenator).
    quality:       minimum quality for hyphenation points, stored on self.
    hyphenDir:     directory holding the hyph_<language>.dic files;
                   defaults to the "dict" directory next to this package.
    **options:     forwarded to ExplicitHyphenator.__init__.

    Side effect: if the dictionary for `language` is not yet installed,
    it is installed into hyphenDir via dictools.
    """
    ExplicitHyphenator.__init__(self, language=language, minWordLength=minWordLength, **options)
    if hyphenDir is None:
        # Default: the "dict" directory one level above this module.
        hyphenDir = os.path.join(os.path.split(__file__)[0], "..", "dict")
    # Note: the dictionary file path is derived by pyhyphen itself from
    # language + directory, so it is not computed here.
    if not dictools.is_installed(language, directory=hyphenDir):
        dictools.install(language, directory=hyphenDir)
        # Parenthesized print works identically under Python 2 and 3.
        print("installed dictionary for %s into %s" % (language, hyphenDir))
    self.hnj = pyhyphen.hyphenator(language, directory=hyphenDir)
    self.quality = quality
def __init__(self, language="EN", minWordLength=4, quality=8, hyphenDir=None, **options):
    """
    Hyphenator backed by the external "pyhyphen" wrapper around libhnj.

    language:      dictionary language code, e.g. "EN" (default).
    minWordLength: words shorter than this are never hyphenated
                   (forwarded to ExplicitHyphenator).
    quality:       minimum quality for hyphenation points, stored on self.
    hyphenDir:     directory holding the hyph_<language>.dic files;
                   defaults to the "dict" directory next to this package.
    **options:     forwarded to ExplicitHyphenator.__init__.

    Side effect: if the dictionary for `language` is not yet installed,
    it is installed into hyphenDir via dictools.
    """
    ExplicitHyphenator.__init__(self, language=language, minWordLength=minWordLength, **options)
    if hyphenDir is None:
        # Default: the "dict" directory one level above this module.
        hyphenDir = os.path.join(os.path.split(__file__)[0], "..", "dict")
    # Note: the dictionary file path is derived by pyhyphen itself from
    # language + directory, so it is not computed here.
    if not dictools.is_installed(language, directory=hyphenDir):
        dictools.install(language, directory=hyphenDir)
        # Parenthesized print works identically under Python 2 and 3.
        print("installed dictionary for %s into %s" % (language, hyphenDir))
    self.hnj = pyhyphen.hyphenator(language, directory=hyphenDir)
    self.quality = quality
def __init__(self, language="EN", minWordLength=4, quality=8, hyphenDir=None, purePython=False, **options):
    """
    Hyphenator driven by Liang-style hyph_*.dic pattern files.

    Note: The purePython version does NOT use Knuth's algorithm, but a
    more simple (and slower) algorithm.

    language:      dictionary language code, e.g. "EN" (default).
    minWordLength: words shorter than this are never hyphenated
                   (forwarded to ExplicitHyphenator).
    quality:       minimum quality for hyphenation points, stored on self.
    hyphenDir:     directory holding the hyph_<language>.dic files;
                   defaults to the "dict" directory inside this package.
    purePython:    if True, parse the pattern file in Python; otherwise
                   delegate to the compiled reportlab.lib.pyHnj module.
    **options:     forwarded to ExplicitHyphenator.__init__.
    """
    ExplicitHyphenator.__init__(self, language=language, minWordLength=minWordLength, **options)
    if hyphenDir is None:
        hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
    self.purePython = purePython
    fname = os.path.join(hyphenDir, "hyph_%s.dic" % language)
    # First line of the file is the set of characters, all other
    # lines are hyphenation patterns.
    if self.purePython:
        # Note: we do not use a TRIE, we just store the patterns
        # in a dict string:codes.
        f = open(fname)
        try:
            lines = f.read().splitlines()
        finally:
            # Fix: the file object was previously never closed (leak).
            f.close()
        self.characters = lines.pop(0)
        self.patterns = {}
        for pattern in lines:
            # Split each pattern into its letters (pat) and the digit
            # codes between them; a missing digit counts as "0".
            pat = ""
            codes = ""
            digit = "0"
            for ch in pattern:
                if '0' <= ch <= '9':
                    digit = ch
                else:
                    codes = codes + digit
                    pat = pat + ch
                    digit = "0"
            codes = codes + digit
            self.patterns[pat.decode("iso-8859-1")] = codes
    else:
        import reportlab.lib.pyHnj as pyHnj
        self.hnj = pyHnj.Hyphen(fname)
    self.quality = quality
def __init__(self, language="EN", minWordLength=4, quality=8, hyphenDir=None, purePython=False, **options):
    """
    Hyphenator driven by Liang-style hyph_*.dic pattern files.

    Note: The purePython version does NOT use Knuth's algorithm, but a
    more simple (and slower) algorithm.

    language:      dictionary language code, e.g. "EN" (default).
    minWordLength: words shorter than this are never hyphenated
                   (forwarded to ExplicitHyphenator).
    quality:       minimum quality for hyphenation points, stored on self.
    hyphenDir:     directory holding the hyph_<language>.dic files;
                   defaults to the "dict" directory inside this package.
    purePython:    if True, parse the pattern file in Python; otherwise
                   delegate to the compiled pyHnj module.
    **options:     forwarded to ExplicitHyphenator.__init__.
    """
    ExplicitHyphenator.__init__(self, language=language, minWordLength=minWordLength, **options)
    if hyphenDir is None:
        hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
    self.purePython = purePython
    fname = os.path.join(hyphenDir, "hyph_%s.dic" % language)
    # First line of the file is the set of characters, all other
    # lines are hyphenation patterns.
    if self.purePython:
        # Note: we do not use a TRIE, we just store the patterns
        # in a dict string:codes.
        f = open(fname)
        try:
            lines = f.read().splitlines()
        finally:
            # Fix: the file object was previously never closed (leak).
            f.close()
        self.characters = lines.pop(0)
        self.patterns = {}
        for pattern in lines:
            # Split each pattern into its letters (pat) and the digit
            # codes between them; a missing digit counts as "0".
            pat = ""
            codes = ""
            digit = "0"
            for ch in pattern:
                if '0' <= ch <= '9':
                    digit = ch
                else:
                    codes = codes + digit
                    pat = pat + ch
                    digit = "0"
            codes = codes + digit
            self.patterns[pat.decode("iso-8859-1")] = codes
    else:
        import pyHnj
        self.hnj = pyHnj.Hyphen(fname)
    self.quality = quality
def i_hyphenate(self, aWord):
    """Hyphenate aWord by delegating to ExplicitHyphenator's
    derived-word lookup (i_hyphenate_derived)."""
    delegate = ExplicitHyphenator.i_hyphenate_derived
    return delegate(self, aWord)
def __init__(self, language="DE", minWordLength=4, qHaupt=8, qNeben=5, qVorsilbe=5, qSchlecht=3, hyphenDir=None, **options): ExplicitHyphenator.__init__(self, language=language, minWordLength=minWordLength, **options) # Qualitäten für verschiedene Trennstellen self.qHaupt = qHaupt self.qNeben = qNeben self.qVorsilbe = qVorsilbe self.qSchlecht = qSchlecht # Stammdaten initialisieren special_words = [] self.roots = [] self.prefixes = [] self.suffixes = [] self.prefix_chars = DEhyph.prefix_chars self.suffix_chars = DEhyph.suffix_chars self.maxLevel = 20 # Statistikdaten initialisieren self.numStatesExamined = 0 # [special_words] einlesen for zeile in DEhyph.special_words.splitlines(): # Leerzeilen und Kommentare überspringen zeile = zeile.strip() if not zeile or zeile.startswith("#"): continue if "=" in zeile: word, trennung = zeile.split("=") else: zeile = zeile.split(",") word = zeile.pop(0) assert len(zeile) >= 1 for attr in zeile: if ":" in attr: propnam, propval = attr.split(":") else: propnam, propval = attr, "" if propnam == u"TRENNUNG": trennung = propval elif propnam == u"KEEP_TOGETHER": trennung = word else: raise NameError("Unknown property for word %s: %s" % (word, propnam)) pass # Attribut ignorieren self.add_entry(word, trennung) # roots, prefixes und suffixes einlesen. # Bei diesen können noch - Komma-getrennt - Eigenschaften angegeben sein. 
# Eine Eigenschaft hat die Form XXX oder XXX:a,b,c for name in ["roots", "prefixes", "suffixes"]: abschnitt = getattr(self, name) zeilen = getattr(DEhyph, name) assert isinstance(zeilen, unicode) for zeile in zeilen.splitlines(): # Leerzeilen und Kommentare überspringen zeile = zeile.strip() if not zeile or zeile.startswith("#"): continue # Aufteilen in word und props zeile = zeile.split(",") word = zeile.pop(0) props = [] if len(zeile) >= 1: for attr in zeile: if ":" in attr: [propnam, propval] = attr.split(":") else: propnam = attr propval = "" try: cls = RULES[propnam] props.append( cls(propval)) # the class is the propnam except KeyError: raise NameError( "Unknown property for word %s: %s" % (word, propnam)) # Jeder abschnitt ist eine Liste von Tupeln (lae, L), wobei L # ein Dictionary von Wörtern der Länge lae ist und dazu die Liste # der möglichen Eigenschaften enthält (dasselbe Wort kann je nach # Bedeutung unterschiedliche Eigenschaften haben). lenword = len(word) for (lae, L) in abschnitt: if lae == lenword: try: L[word].append(props) except KeyError: L[word] = [props] break else: abschnitt.append((lenword, {word: [props]})) self.stripper = Stripper(self.prefix_chars, self.suffix_chars)