Пример #1
0
 def __init__(self, errorHandler=None):
     """Set up an empty grammar and register it with the model classes.

     errorHandler -- object stored as self.errorHandler; when it is None,
                     a new ErrorHandler instance is created instead.
     """
     # Containers that start out empty.
     self.lexemes = []
     self.paradigms = {}         # paradigm name -> Paradigm object
     self.lexByParadigm = {}     # paradigm name -> links to the sublexemes
                                 # with that paradigm, as (lex, subLex)
     self.afxTables = {}         # table name -> AfxTable object
     # There is a single prefix table for the whole grammar.
     prefixes = AfxTable()
     prefixes.name = u'pfx'
     self.pfxTable = prefixes
     self.stemConversions = []
     self.derivations = {}
     self.errorHandler = ErrorHandler() if errorHandler is None \
                         else errorHandler
     # Expose this grammar to the model classes through their class-level
     # attribute `g`.
     for modelClass in (Lexeme, Inflexion, Paradigm, Derivation):
         modelClass.g = self
Пример #2
0
class Grammar:
    """The main class of the project.

    A Grammar object keeps the lexemes, paradigms, derivations, stem
    conversions and affix tables in one place, and offers methods for
    loading them from files, building the affix tables and writing those
    tables out.
    """

    # NOTE(review): RECURS_LIMIT, DERIV_LIMIT and TOTAL_DERIV_LIMIT are not
    # read by any of the methods defined directly below; presumably they are
    # consulted by other classes through their `g` attribute -- confirm.
    RECURS_LIMIT = 2
    DERIV_LIMIT = 4             # counts only non-empty derivands
    TOTAL_DERIV_LIMIT = 10      # counts everything
    # The two limits below (1024 ** 3 each) are compared against
    # sys.getsizeof() of self.lexemes and self.paradigms respectively.
    # sys.getsizeof() is shallow: it measures the container only, not the
    # objects stored in it.
    LEX_MEMORY_LIMIT = 1024 * 1024 * 1024 # Memory limit for the lexeme list
    PARADIGM_MEMORY_LIMIT = 1024 * 1024 * 1024 # Memory limit for the paradigm list
    
    def __init__(self, errorHandler=None):
        """Create an empty grammar and register it with the model classes.

        errorHandler -- object stored as self.errorHandler and passed on to
                        the objects created by the load_* methods; when it
                        is None, a new ErrorHandler instance is created.
        """
        self.lexemes = []
        self.paradigms = {}         # name -> Paradigm object
        self.lexByParadigm = {}     # paradigm name -> links to sublexemes which
                                    # have that paradigm in the form (lex, subLex)
        self.afxTables = {}         # name -> AfxTable object
        self.pfxTable = AfxTable()  # the single prefix table
        self.pfxTable.name = u'pfx'
        # Must be a dict from the start: load_stem_conversions() replaces it
        # with a dict keyed by conversion name, so an empty dict keeps the
        # attribute's type the same before and after loading.
        self.stemConversions = {}   # conversion name -> StemConversion
        self.derivations = {}       # derivation name -> Derivation object
        if errorHandler is None:
            self.errorHandler = ErrorHandler()
        else:
            self.errorHandler = errorHandler
        # Make this grammar reachable from the model classes through their
        # class-level attribute `g`.
        Lexeme.g = self
        Inflexion.g = self
        Paradigm.g = self
        Derivation.g = self

    def load_stem_conversions(self, fname):
        """Load stem conversion rules from a file.
        Return the number of rules loaded."""
        if len(self.lexemes) > 0:
            self.raise_error(u'Loading stem conversions should occur before ' +\
                             u'loading stems.')
            return 0
        conversionDescrs = yamlReader.read_file(fname, self.errorHandler)
        self.stemConversions = {} # {conversion name -> StemConversion}
        for dictSC in conversionDescrs:
            sc = StemConversion(dictSC, self.errorHandler)
            self.stemConversions[sc.name] = sc
        return len(self.stemConversions)

    def load_paradigms(self, fname, pLst=None):
        """Load paradigms from a file.
        Return the number of paradigms loaded."""
        if len(self.lexemes) > 0:
            self.raise_error(u'Loading paradigms should occur before ' +\
                             u'loading stems.')
            return 0
        paraDescrs = yamlReader.read_file(fname, self.errorHandler)
        for dictDescr in paraDescrs:
            if sys.getsizeof(self.paradigms) > self.PARADIGM_MEMORY_LIMIT:
                self.raise_error(u'Not enough memory for the paradigms.')
                return
            self.paradigms[dictDescr[u'value']] =\
                                    Paradigm(dictDescr, self.errorHandler)
        newParadigms = {}
        for pName, p in self.paradigms.iteritems():
            if pLst is None or pName in pLst:
                p = copy.deepcopy(p)
                p.compile_paradigm()
                newParadigms[pName] = p
        self.paradigms = newParadigms
        return len(self.paradigms)

    def load_lexemes(self, fname):
        """Load lexemes from a file and append them to self.lexemes.

        Return the total number of lexemes held in self.lexemes afterwards,
        or 0 if loading was aborted because the memory limit was exceeded
        (the problem is then reported through self.raise_error; lexemes
        appended before the abort stay in the list)."""
        lexDescrs = yamlReader.read_file(fname, self.errorHandler)
        for dictDescr in lexDescrs:
            # sys.getsizeof() is shallow: only the list itself is measured,
            # not the Lexeme objects stored in it.
            if sys.getsizeof(self.lexemes) > self.LEX_MEMORY_LIMIT:
                self.raise_error(u'Not enough memory for the lexemes.')
                # Return 0 rather than None so that the method yields an
                # integer on every path.
                return 0
            self.lexemes.append(Lexeme(dictDescr, self.errorHandler))
        return len(self.lexemes)

    def load_derivations(self, fname):
        """Load derivations from a file.
        Return the number of derivations loaded."""
        derivDescrs = yamlReader.read_file(fname, self.errorHandler)
        for dictDescr in derivDescrs:
##            self.derivations[u'#deriv#' + dictDescr[u'value']] =\
##                                    Paradigm(dictDescr, self.errorHandler)
            dictDescr[u'value'] = u'#deriv#' + dictDescr[u'value']
            self.derivations[dictDescr[u'value']] =\
                                    Derivation(dictDescr, self.errorHandler)
        for paradigm in self.paradigms.values():
            derivations.deriv_for_paradigm(paradigm)
        for derivName, deriv in self.derivations.iteritems():
            if derivName.startswith(u'#deriv#paradigm#'):
                deriv.build_links()
                print derivName + u': build complete.'
                #print unicode(self.derivations[u'#deriv#paradigm#Nctt'])
                deriv.extend_leaves()
                print derivName + u': leaves extended.'
                #print unicode(deriv)
        #print unicode(self.derivations[u'#deriv#N-fӕ#paradigm#Nct'])
        print u'Leaves extended.'
        #print unicode(self.derivations[u'#deriv#paradigm#Nct'])
        
        for derivName, deriv in self.derivations.iteritems():
            p = deriv.to_paradigm()
            self.paradigms[derivName] = p
        for derivName in self.derivations:
            print u'Compiling ' + derivName + u'... ',
            self.paradigms[derivName].compile_paradigm()
            print u'compiled.'
            gc.collect()
            if derivName == u'#deriv#paradigm#Nctt':
                fPara = codecs.open(u'test-ossetic/deriv-Nctt-test.txt', 'w', 'utf-8-sig')
                for f in self.paradigms[derivName].flex:
                    fPara.write(unicode(f))
                fPara.close()
        print u'Derivations compiled.'
        for lex in self.lexemes:
            lex.add_derivations()
        return len(self.derivations)

    def compile_all(self):
        for lex in self.lexemes:
            lex.generate_redupl_paradigm()
            lex.generate_regex_paradigm()
            if sys.getsizeof(self.paradigms) > self.PARADIGM_MEMORY_LIMIT:
                self.raise_error(u'Not enough memory for the paradigms.')
                return
            for sl in lex.subLexemes:
                try:
                    self.lexByParadigm[sl.paradigm].append((lex, sl))
                except KeyError:
                    self.lexByParadigm[sl.paradigm] = [(lex, sl)]

    def build_afx_tables(self):
        """Build the tables of prefixes, stems+infixes, and suffixes."""
        for paraName in self.paradigms:
            try:
                curLexemes = self.lexByParadigm[paraName]
                tableStems = self.paradigms[paraName].get_stems(curLexemes,
                                                                u'sfx=' + paraName)
                tableStems.name = u'stems=' + paraName
                #print u'Stem table: ' + tableStems.name
            except KeyError:
                continue    # no lexemes with that paradigm
            tablePfx = self.paradigms[paraName].get_pfx(tableStems.name)
            self.pfxTable += tablePfx
            tableSfx = self.paradigms[paraName].get_sfx()
            tableSfx.name = u'sfx=' + paraName
            self.afxTables[tableStems.name] = tableStems
            self.afxTables[tableSfx.name] = tableSfx

    def clean_unused_feature_sets(self):
        """Replace by an empty set every feature set kept in the
        FeatureSetLink class that no affix of the grammar refers to.

        The numbers of the sets in use are gathered from the fsLinks of
        all affixes in self.afxTables and in self.pfxTable."""
        grInUse, lexInUse, linkInUse = set(), set(), set()
        allTables = list(self.afxTables.values())
        allTables.append(self.pfxTable)
        for table in allTables:
            for affix in table.afxs.values():
                for link in affix.fsLinks:
                    grInUse.add(link.grSetNum)
                    lexInUse.add(link.lexSetNum)
                    linkInUse.add(link.linkSetNum)
        # Each storage of FeatureSetLink is paired with the numbers that
        # are still referenced from it.
        storages = ((FeatureSetLink.grFeatureSets, grInUse),
                    (FeatureSetLink.lexFeatureSets, lexInUse),
                    (FeatureSetLink.linkSets, linkInUse))
        for storage, inUse in storages:
            for iFS in range(len(storage)):
                if iFS not in inUse:
                    storage[iFS] = set()
    
    def write_afx_tables(self, fname):
        """Write the feature sets, the prefix table and all other affix
        tables to the file fname, after clearing the unused feature sets.

        The file is emptied first; each writer is then handed only the
        file name."""
        # Opening the file for writing and closing it straight away
        # discards whatever it contained.
        with codecs.open(fname, 'w', 'utf-8-sig'):
            pass
        self.clean_unused_feature_sets()
        write_feature_sets(fname)
        self.pfxTable.write(fname)
        for table in self.afxTables.values():
            table.write(fname)