def __init__(self,
             language="EN",
             minWordLength=4,
             quality=8,
             hyphenDir=None,
             **options):
    """Create a hyphenator backed by a pyhyphen dictionary.

    Args:
        language: Language code handed to ``dictools`` and
            ``pyhyphen.hyphenator``.
        minWordLength: Forwarded unchanged to ``ExplicitHyphenator``.
        quality: Stored as ``self.quality``.
        hyphenDir: Directory holding the dictionaries.  When ``None``,
            the ``dict`` directory one level above this module's
            directory is used.
        **options: Forwarded unchanged to ``ExplicitHyphenator``.

    Side effects:
        If ``dictools.is_installed`` reports the dictionary as missing,
        it is installed into ``hyphenDir`` and a notice is printed.
    """
    ExplicitHyphenator.__init__(self,
                                language=language,
                                minWordLength=minWordLength,
                                **options)
    if hyphenDir is None:
        hyphenDir = os.path.join(os.path.split(__file__)[0], "..", "dict")
    if not dictools.is_installed(language, directory=hyphenDir):
        dictools.install(language, directory=hyphenDir)
        # Single parenthesised argument: prints the same text whether
        # ``print`` is a statement (Python 2) or a function (Python 3).
        print("installed dictionary for %s into %s" % (language, hyphenDir))
    self.hnj = pyhyphen.hyphenator(language, directory=hyphenDir)
    self.quality = quality
示例#2
0
 def __init__(self,
              language="EN",
              minWordLength=4,
              quality=8,
              hyphenDir=None,
              **options):
     """Initialise the hyphenator and make sure its dictionary exists.

     Args:
         language: Language code passed to ``dictools`` and to
             ``pyhyphen.hyphenator``.
         minWordLength: Passed through to ``ExplicitHyphenator``.
         quality: Kept on the instance as ``self.quality``.
         hyphenDir: Dictionary directory.  Defaults to ``../dict``
             relative to the directory containing this module.
         **options: Passed through to ``ExplicitHyphenator``.

     Side effects:
         Installs the dictionary through ``dictools.install`` (and
         prints a notice) when ``dictools.is_installed`` returns a
         false value for ``language`` in ``hyphenDir``.
     """
     ExplicitHyphenator.__init__(self,
                                 language=language,
                                 minWordLength=minWordLength,
                                 **options)
     if hyphenDir is None:
         module_dir = os.path.split(__file__)[0]
         hyphenDir = os.path.join(module_dir, "..", "dict")
     if not dictools.is_installed(language, directory=hyphenDir):
         dictools.install(language, directory=hyphenDir)
         # Parenthesised single argument works as both the Python 2
         # print statement and the Python 3 print function.
         print("installed dictionary for %s into %s" % (language, hyphenDir))
     self.hnj = pyhyphen.hyphenator(language, directory=hyphenDir)
     self.quality = quality
示例#3
0
 def __init__(self,
              language="EN",
              minWordLength=4,
              quality=8,
              hyphenDir=None,
              purePython=False,
              **options):
     """Load hyphenation patterns from ``hyph_<language>.dic``.

     Note:
         The purePython version does NOT use Knuth's algorithm,
         but a simpler (and slower) algorithm.

     Args:
         language: Language code used to build the dictionary file name.
         minWordLength: Forwarded unchanged to ``ExplicitHyphenator``.
         quality: Stored as ``self.quality``.
         hyphenDir: Directory containing the ``.dic`` file.  Defaults to
             the ``dict`` directory next to this module.
         purePython: When true, the patterns are parsed here into
             ``self.characters`` / ``self.patterns``; otherwise the file
             is handed to ``pyHnj.Hyphen`` and kept as ``self.hnj``.
         **options: Forwarded unchanged to ``ExplicitHyphenator``.
     """
     ExplicitHyphenator.__init__(self,
                                 language=language,
                                 minWordLength=minWordLength,
                                 **options)
     if hyphenDir is None:
         hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
     self.purePython = purePython
     fname = os.path.join(hyphenDir, "hyph_%s.dic" % language)
     if self.purePython:
         # First line is the set of characters, all other lines are patterns.
         # The file is closed as soon as it has been read.
         with open(fname) as dic_file:
             lines = dic_file.read().splitlines()
         self.characters = lines.pop(0)
         # Note: no TRIE is used; patterns are stored in a plain dict
         # mapping the pattern's letters to its string of digit codes.
         self.patterns = {}
         for pattern in lines:
             # A digit applies to the position before the next letter;
             # positions without a digit get "0".  E.g. "a1b" is stored
             # as key "ab" with codes "010".
             letters = []
             codes = []
             digit = "0"
             for ch in pattern:
                 if "0" <= ch <= "9":
                     digit = ch
                 else:
                     codes.append(digit)
                     letters.append(ch)
                     digit = "0"
             codes.append(digit)  # code for the position after the last letter
             key = "".join(letters).decode("iso-8859-1")
             self.patterns[key] = "".join(codes)
     else:
         import reportlab.lib.pyHnj as pyHnj
         self.hnj = pyHnj.Hyphen(fname)
     self.quality = quality
 def __init__(self,
              language="EN",
              minWordLength=4,
              quality=8,
              hyphenDir=None,
              purePython=False,
              **options):
     """Initialise the hyphenator from a ``hyph_<language>.dic`` pattern file.

     Note:
         The purePython version does NOT use Knuth's algorithm,
         but a simpler (and slower) algorithm.

     Args:
         language: Language code used to build the dictionary file name.
         minWordLength: Passed through to ``ExplicitHyphenator``.
         quality: Kept on the instance as ``self.quality``.
         hyphenDir: Directory containing the ``.dic`` file; the ``dict``
             directory beside this module is used when ``None``.
         purePython: If true, parse the pattern file into
             ``self.characters`` and ``self.patterns``; if false, load it
             with ``pyHnj.Hyphen`` into ``self.hnj``.
         **options: Passed through to ``ExplicitHyphenator``.
     """
     ExplicitHyphenator.__init__(self,
                                 language=language,
                                 minWordLength=minWordLength,
                                 **options)
     if hyphenDir is None:
         hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
     self.purePython = purePython
     fname = os.path.join(hyphenDir, "hyph_%s.dic" % language)
     if self.purePython:
         # First line is the set of characters, all other lines are patterns.
         # Using a context manager so the file handle is not leaked.
         with open(fname) as pattern_file:
             lines = pattern_file.read().splitlines()
         self.characters = lines.pop(0)
         # Note: we do not use a TRIE, we just store the patterns in a
         # dict of letters -> codes.
         self.patterns = {}
         for pattern in lines:
             # Split the pattern into its letters and one digit code per
             # inter-letter position ("0" where the pattern has no digit),
             # e.g. "a1b" -> letters "ab", codes "010".
             pat_chars = []
             code_chars = []
             pending = "0"
             for ch in pattern:
                 if "0" <= ch <= "9":
                     pending = ch
                 else:
                     code_chars.append(pending)
                     pat_chars.append(ch)
                     pending = "0"
             code_chars.append(pending)  # trailing position
             pat = "".join(pat_chars)
             self.patterns[pat.decode("iso-8859-1")] = "".join(code_chars)
     else:
         import pyHnj
         self.hnj = pyHnj.Hyphen(fname)
     self.quality = quality
示例#5
0
    def __init__(self,
                 language="DE",
                 minWordLength=4,
                 qHaupt=8,
                 qNeben=5,
                 qVorsilbe=5,
                 qSchlecht=3,
                 hyphenDir=None,
                 **options):
        """Build the hyphenator's word tables from the ``DEhyph`` data.

        Args:
            language: Forwarded to ``ExplicitHyphenator``.
            minWordLength: Forwarded to ``ExplicitHyphenator``.
            qHaupt, qNeben, qVorsilbe, qSchlecht: Quality values for the
                different kinds of hyphenation points; stored on the
                instance under the same names.
            hyphenDir: Accepted but not used by this constructor.
            **options: Forwarded to ``ExplicitHyphenator``.

        Raises:
            NameError: If a data line names an unknown property.
        """
        ExplicitHyphenator.__init__(self,
                                    language=language,
                                    minWordLength=minWordLength,
                                    **options)

        # Qualities for the different kinds of hyphenation points
        self.qHaupt = qHaupt
        self.qNeben = qNeben
        self.qVorsilbe = qVorsilbe
        self.qSchlecht = qSchlecht

        # Initialise the word-stem data
        self.roots = []
        self.prefixes = []
        self.suffixes = []
        self.prefix_chars = DEhyph.prefix_chars
        self.suffix_chars = DEhyph.suffix_chars
        self.maxLevel = 20

        # Initialise the statistics
        self.numStatesExamined = 0

        # Read the [special_words] section.  A line is either
        # "word=hyphenation" or "word,PROP[:value],...".
        for zeile in DEhyph.special_words.splitlines():
            # Skip blank lines and comments
            zeile = zeile.strip()
            if not zeile or zeile.startswith("#"):
                continue
            if "=" in zeile:
                word, trennung = zeile.split("=")
            else:
                zeile = zeile.split(",")
                word = zeile.pop(0)
                # At least one property is required, otherwise no
                # hyphenation would be defined for this word.
                assert len(zeile) >= 1
                for attr in zeile:
                    if ":" in attr:
                        propnam, propval = attr.split(":")
                    else:
                        propnam, propval = attr, ""
                    if propnam == u"TRENNUNG":
                        trennung = propval
                    elif propnam == u"KEEP_TOGETHER":
                        trennung = word
                    else:
                        raise NameError("Unknown property for word %s: %s" %
                                        (word, propnam))
            self.add_entry(word, trennung)

        # Read roots, prefixes and suffixes.
        # Each word may be followed by comma-separated properties; the line
        # is split on commas first, then each item on ":" into name/value.
        for name in ["roots", "prefixes", "suffixes"]:
            abschnitt = getattr(self, name)
            zeilen = getattr(DEhyph, name)
            assert isinstance(zeilen, unicode)
            for zeile in zeilen.splitlines():
                # Skip blank lines and comments
                zeile = zeile.strip()
                if not zeile or zeile.startswith("#"):
                    continue
                # Split into word and props
                zeile = zeile.split(",")
                word = zeile.pop(0)
                props = []
                for attr in zeile:
                    if ":" in attr:
                        propnam, propval = attr.split(":")
                    else:
                        propnam, propval = attr, ""
                    # Only the lookup is guarded, so a KeyError raised
                    # while constructing the rule is not misreported as
                    # an unknown property name.
                    try:
                        cls = RULES[propnam]  # the class is the propnam
                    except KeyError:
                        raise NameError(
                            "Unknown property for word %s: %s" %
                            (word, propnam))
                    props.append(cls(propval))
                # Each section is a list of tuples (lae, L), where L is a
                # dictionary of the words of length lae, mapping each word
                # to the list of its possible property lists (the same
                # word can have different properties depending on its
                # meaning).
                lenword = len(word)
                for (lae, L) in abschnitt:
                    if lae == lenword:
                        L.setdefault(word, []).append(props)
                        break
                else:
                    # No bucket for this word length yet
                    abschnitt.append((lenword, {word: [props]}))
        self.stripper = Stripper(self.prefix_chars, self.suffix_chars)